From 89da9f8ae22f4d2472939c1cbd620454a3aa1ef3 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 14 Apr 2022 21:50:58 +0100 Subject: [PATCH 01/18] Error on unknown character properties Previously we would form an `.other` character property kind for any unclassified properties, which would crash at runtime as unsupported. Instead, switch to erroring on them. Eventually it would be nice if we could version this based on what the runtime being targeted supports. --- Sources/_RegexParser/Regex/AST/Atom.swift | 3 --- .../CharacterPropertyClassification.swift | 9 ++++--- .../Regex/Parse/Diagnostics.swift | 9 ++++--- .../_StringProcessing/ConsumerInterface.swift | 4 --- Tests/RegexTests/ParseTests.swift | 27 ++++++++++--------- 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 9cc2e9a96..1f6043d72 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -401,9 +401,6 @@ extension AST.Atom.CharacterProperty { /// Some special properties implemented by PCRE and Oniguruma. case pcreSpecial(PCRESpecialCategory) case onigurumaSpecial(OnigurumaSpecialProperty) - - /// Unhandled properties. - case other(key: String?, value: String) } // TODO: erm, separate out or fold into something? splat it in? diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index e5b65a46c..911312121 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -397,8 +397,9 @@ extension Source { return .pcreSpecial(pcreSpecial) } - // Otherwise we don't know what this is. - return .other(key: nil, value: value) + // TODO: This should be versioned, and do we want a more lax behavior for + // the runtime? 
+ throw ParseError.unknownProperty(key: nil, value: value) } static func classifyCharacterProperty( @@ -435,6 +436,8 @@ extension Source { if let match = match { return match } - return .other(key: key, value: value) + // TODO: This should be versioned, and do we want a more lax behavior for + // the runtime? + throw ParseError.unknownProperty(key: key, value: value) } } diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index 621d6ea11..c3d74c30b 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -57,8 +57,8 @@ enum ParseError: Error, Hashable { case expectedCustomCharacterClassMembers case invalidCharacterClassRangeOperand - case invalidPOSIXSetName(String) case emptyProperty + case unknownProperty(key: String?, value: String) case expectedGroupSpecifier case unbalancedEndOfGroup @@ -142,10 +142,13 @@ extension ParseError: CustomStringConvertible { return "expected custom character class members" case .invalidCharacterClassRangeOperand: return "invalid character class range" - case let .invalidPOSIXSetName(n): - return "invalid character set name: '\(n)'" case .emptyProperty: return "empty property" + case .unknownProperty(let key, let value): + if let key = key { + return "unknown character property '\(key)=\(value)'" + } + return "unknown character property '\(value)'" case .expectedGroupSpecifier: return "expected group specifier" case .unbalancedEndOfGroup: diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index b49804ca1..3c84195aa 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -423,10 +423,6 @@ extension AST.Atom.CharacterProperty { case .onigurumaSpecial(let s): throw Unsupported("TODO: map Oniguruma special: \(s)") - - case let .other(key, value): - throw Unsupported( - "TODO: map 
other \(key ?? "")=\(value)") } }() diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index bdae250ba..4043e4ccb 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -501,7 +501,6 @@ extension RegexTests { parseTest(#"[[:a]]"#, charClass(charClass(":", "a"))) parseTest(#"[[:}]]"#, charClass(charClass(":", "}"))) parseTest(#"[[:{]]"#, charClass(charClass(":", "{"))) - parseTest(#"[[:{:]]"#, charClass(posixProp_m(.other(key: nil, value: "{")))) parseTest(#"[[:}:]]"#, charClass(charClass(":", "}", ":"))) parseTest( @@ -1141,14 +1140,6 @@ extension RegexTests { #"\p{C}+"#, oneOrMore(of: prop(.generalCategory(.other)))) - // TODO: Start erroring on these? - parseTest(#"\p{Lx}"#, prop(.other(key: nil, value: "Lx"))) - parseTest(#"\p{gcL}"#, prop(.other(key: nil, value: "gcL"))) - parseTest(#"\p{x=y}"#, prop(.other(key: "x", value: "y"))) - parseTest(#"\p{aaa(b)}"#, prop(.other(key: nil, value: "aaa(b)"))) - parseTest("[[:a():]]", charClass(posixProp_m(.other(key: nil, value: "a()")))) - parseTest(#"\p{aaa\p{b}}"#, concat(prop(.other(key: nil, value: #"aaa\p{b"#)), "}")) - // UAX44-LM3 means all of the below are equivalent. 
let lowercaseLetter = prop(.generalCategory(.lowercaseLetter)) parseTest(#"\p{ll}"#, lowercaseLetter) @@ -2231,12 +2222,12 @@ extension RegexTests { diagnosticTest(#"\x{5"#, .expected("}")) diagnosticTest(#"\N{A"#, .expected("}")) diagnosticTest(#"\N{U+A"#, .expected("}")) - diagnosticTest(#"\p{a"#, .expected("}")) + diagnosticTest(#"\p{a"#, .unknownProperty(key: nil, value: "a")) diagnosticTest(#"\p{a="#, .emptyProperty) diagnosticTest(#"\p{a=}"#, .emptyProperty) - diagnosticTest(#"\p{a=b"#, .expected("}")) - diagnosticTest(#"\p{aaa[b]}"#, .expected("}")) - diagnosticTest(#"\p{a=b=c}"#, .expected("}")) + diagnosticTest(#"\p{a=b"#, .unknownProperty(key: "a", value: "b")) + diagnosticTest(#"\p{aaa[b]}"#, .unknownProperty(key: nil, value: "aaa")) + diagnosticTest(#"\p{a=b=c}"#, .unknownProperty(key: "a", value: "b")) diagnosticTest(#"(?#"#, .expected(")")) diagnosticTest(#"(?x"#, .expected(")")) @@ -2321,6 +2312,16 @@ extension RegexTests { diagnosticTest(#"\\#u{E9}"#, .invalidEscape("é")) diagnosticTest(#"\˂"#, .invalidEscape("˂")) + // MARK: Character properties + + diagnosticTest(#"\p{Lx}"#, .unknownProperty(key: nil, value: "Lx")) + diagnosticTest(#"\p{gcL}"#, .unknownProperty(key: nil, value: "gcL")) + diagnosticTest(#"\p{x=y}"#, .unknownProperty(key: "x", value: "y")) + diagnosticTest(#"\p{aaa(b)}"#, .unknownProperty(key: nil, value: "aaa(b)")) + diagnosticTest("[[:a():]]", .unknownProperty(key: nil, value: "a()")) + diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: #"aaa\p{b"#)) + diagnosticTest(#"[[:{:]]"#, .unknownProperty(key: nil, value: "{")) + // MARK: Matching options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) From 3f161701c9f76a2e18cb3c47ee95453911337b83 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 19 Apr 2022 11:46:59 +0100 Subject: [PATCH 02/18] Don't parse a character property containing a backslash Add backslash to the list of characters we don't consider valid for a character property name. 
This means that we'll bail when attempting to lex a POSIX character property and instead lex a custom character class. This allows e.g `[:\Q :] \E]` to be lexed as a custom character class. For `\p{...}` this just means we'll emit a truncated invalid property error, which is arguably more in line with what the user was expecting. I noticed when digging through the ICU source code that it will bail out of parsing a POSIX character property if it encounters one of its known escape sequences (e.g `\a`, `\e`, `\f`, ...). Interestingly this doesn't cover character class escapes e.g `\d`, but it's not clear that is intentional. Given backslash is not a valid character property character anyway, it seems reasonable to broaden this behavior to bail on any backslash. --- .../Regex/Parse/LexicalAnalysis.swift | 8 +++++++ Tests/RegexTests/ParseTests.swift | 21 ++++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index e8b7e9e18..6a61ccdf7 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -1176,6 +1176,14 @@ extension Source { // character property name anyway, and it's nice not to have diverging // logic for these cases. return true + case "\\": + // An escape sequence, which may include e.g '\Q :] \E'. ICU bails here + // for all its known escape sequences (e.g '\a', '\e' '\f', ...). It + // seems character class escapes e.g '\d' are excluded, however it's not + // clear that is intentional. Let's apply the rule for any escape, as a + // backslash would never be a valid character property name, and we can + // diagnose any invalid escapes when parsing as a character class. + return true default: // We may want to handle other metacharacters here, e.g '{', '(', ')', // as they're not valid character property names. 
However for now diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 4043e4ccb..94c134853 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -503,6 +503,25 @@ extension RegexTests { parseTest(#"[[:{]]"#, charClass(charClass(":", "{"))) parseTest(#"[[:}:]]"#, charClass(charClass(":", "}", ":"))) + parseTest( + #"[:[:space:]:]"#, + charClass(":", posixProp_m(.binary(.whitespace)), ":") + ) + parseTest( + #"[:a[:space:]b:]"#, + charClass(":", "a", posixProp_m(.binary(.whitespace)), "b", ":") + ) + + // ICU parses a custom character class if it sees any of its known escape + // sequences in a POSIX character property (though it appears to exclude + // character class escapes e.g '\d'). We do so for any escape sequence as + // '\' is not a valid character property character. + parseTest(#"[:\Q:]\E]"#, charClass(":", quote_m(":]"))) + parseTest(#"[:\a:]"#, charClass(":", atom_m(.escaped(.alarm)), ":")) + parseTest(#"[:\d:]"#, charClass(":", atom_m(.escaped(.decimalDigit)), ":")) + parseTest(#"[:\\:]"#, charClass(":", "\\", ":")) + parseTest(#"[:\:]"#, charClass(":", ":")) + parseTest( #"\D\S\W"#, concat( @@ -2319,7 +2338,7 @@ extension RegexTests { diagnosticTest(#"\p{x=y}"#, .unknownProperty(key: "x", value: "y")) diagnosticTest(#"\p{aaa(b)}"#, .unknownProperty(key: nil, value: "aaa(b)")) diagnosticTest("[[:a():]]", .unknownProperty(key: nil, value: "a()")) - diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: #"aaa\p{b"#)) + diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: "aaa")) diagnosticTest(#"[[:{:]]"#, .unknownProperty(key: nil, value: "{")) // MARK: Matching options From fa5f2f1c48e96bffc51ebad54fc48f7acc0ab0be Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 19 Apr 2022 13:00:40 +0100 Subject: [PATCH 03/18] Update Regex Syntax document for `[:...:]` changes Clarify that `[:...:]` may be used outside of a custom character class, and discuss the 
character class disambiguation behavior. --- .../RegexSyntaxRunTimeConstruction.md | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md b/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md index cab21288d..3fb0841e3 100644 --- a/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md +++ b/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md @@ -392,7 +392,7 @@ For non-Unicode properties, only a value is required. These include: - The special PCRE2 properties `Xan`, `Xps`, `Xsp`, `Xuc`, `Xwd`. - The special Java properties `javaLowerCase`, `javaUpperCase`, `javaWhitespace`, `javaMirrored`. -Note that the internal `PropertyContents` syntax is shared by both the `\p{...}` and POSIX-style `[:...:]` syntax, allowing e.g `[:script=Latin:]` as well as `\p{alnum}`. +Note that the internal `PropertyContents` syntax is shared by both the `\p{...}` and POSIX-style `[:...:]` syntax, allowing e.g `[:script=Latin:]` as well as `\p{alnum}`. Both spellings may be used inside and outside of a custom character class. #### `\K` @@ -534,6 +534,7 @@ These operators have a lower precedence than the implicit union of members, e.g To avoid ambiguity between .NET's subtraction syntax and range syntax, .NET specifies that a subtraction will only be parsed if the right-hand-side is a nested custom character class. We propose following this behavior. +Note that a custom character class may begin with the `:` character, and only becomes a POSIX character property if a closing `:]` is present. For example, `[:a]` is the character class of `:` and `a`. 
### Matching options @@ -863,7 +864,23 @@ PCRE supports `\N` meaning "not a newline", however there are engines that treat ### Extended character property syntax -ICU unifies the character property syntax `\p{...}` with the syntax for POSIX character classes `[:...:]`, such that they follow the same internal grammar, which allows referencing any Unicode character property in addition to the POSIX properties. We propose supporting this, though it is a purely additive feature, and therefore should not conflict with regex engines that implement a more limited POSIX syntax. +ICU unifies the character property syntax `\p{...}` with the syntax for POSIX character classes `[:...:]`. This has two effects: + +- They share the same internal grammar, which allows the use of any Unicode character properties in addition to the POSIX properties. +- The POSIX syntax may be used outside of custom character classes, unlike in PCRE and Oniguruma. + +We propose following both of these rules. The former is purely additive, and therefore should not conflict with regex engines that implement a more limited POSIX syntax. The latter does conflict with other engines, but we feel it is much more likely that a user would expect e.g `[:space:]` to be a character property rather than the character class `[:aceps]`. We do however feel that a warning might be warranted in order to avoid confusion. + +### POSIX character property disambiguation + +PCRE, Oniguruma and ICU allow `[:` to be part of a custom character class if a closing `:]` is not present. For example, `[:a]` is the character class of `:` and `a`. However they each have different rules for detecting the closing `:]`: + +- PCRE will scan ahead until it hits either `:]`, `]`, or `[:`. +- Oniguruma will scan ahead until it hits either `:]`, `]`, or the length exceeds 20 characters. +- ICU will scan ahead until it hits a known escape sequence (e.g `\a`, `\e`, `\Q`, ...), or `:]`. Note this excludes character class escapes e.g `\d`. 
It also excludes `]`, meaning that even `[:a][:]` is parsed as a POSIX character property. + +We propose unifying these behaviors by scanning ahead until we hit either `[`, `]`, `:]`, or `\`. Additionally, we will stop on encountering `}` or a second occurrence of `=`. These fall out of the fact that they would be invalid contents of the alternative `\p{...}` syntax. + ### Script properties From 9ccde19174d60a4075d6529a844e0e2a1e204960 Mon Sep 17 00:00:00 2001 From: Richard Wei Date: Tue, 19 Apr 2022 09:11:43 -0700 Subject: [PATCH 04/18] Support obtaining captures by name on `AnyRegexOutput` (#300) Resolves #266. --- Sources/_StringProcessing/ByteCodeGen.swift | 4 ++-- .../_StringProcessing/Engine/MEBuilder.swift | 12 ++++++++++-- .../_StringProcessing/Engine/MECapture.swift | 1 + .../_StringProcessing/Engine/MEProgram.swift | 1 + Sources/_StringProcessing/Executor.swift | 4 +++- .../Regex/AnyRegexOutput.swift | 19 +++++++++++++++++-- Sources/_StringProcessing/Regex/Match.swift | 3 +++ Tests/RegexBuilderTests/RegexDSLTests.swift | 13 +++++++++---- 8 files changed, 46 insertions(+), 11 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index b6f9b4732..86309bb8a 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -587,11 +587,11 @@ extension Compiler.ByteCodeGen { try emitConcatenationComponent(child) } - case let .capture(_, refId, child): + case let .capture(name, refId, child): options.beginScope() defer { options.endScope() } - let cap = builder.makeCapture(id: refId) + let cap = builder.makeCapture(id: refId, name: name) switch child { case let .matcher(_, m): emitMatcher(m, into: cap) diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 7cf94f6ef..2b38ace0a 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ 
-45,6 +45,7 @@ extension MEProgram where Input.Element: Hashable { // Symbolic reference resolution var unresolvedReferences: [ReferenceID: [InstructionAddress]] = [:] var referencedCaptureOffsets: [ReferenceID: Int] = [:] + var namedCaptureOffsets: [String: Int] = [:] var captureCount: Int { // We currently deduce the capture count from the capture register number. nextCaptureRegister.rawValue @@ -353,7 +354,8 @@ extension MEProgram.Builder { staticMatcherFunctions: matcherFunctions, registerInfo: regInfo, captureStructure: captureStructure, - referencedCaptureOffsets: referencedCaptureOffsets) + referencedCaptureOffsets: referencedCaptureOffsets, + namedCaptureOffsets: namedCaptureOffsets) } mutating func reset() { self = Self() } @@ -438,7 +440,9 @@ fileprivate extension MEProgram.Builder { // Register helpers extension MEProgram.Builder { - mutating func makeCapture(id: ReferenceID?) -> CaptureRegister { + mutating func makeCapture( + id: ReferenceID?, name: String? + ) -> CaptureRegister { defer { nextCaptureRegister.rawValue += 1 } // Register the capture for later lookup via symbolic references. if let id = id { @@ -446,6 +450,10 @@ extension MEProgram.Builder { captureCount, forKey: id) assert(preexistingValue == nil) } + if let name = name { + // TODO: Reject duplicate capture names unless `(?J)`? 
+ namedCaptureOffsets.updateValue(captureCount, forKey: name) + } return nextCaptureRegister } diff --git a/Sources/_StringProcessing/Engine/MECapture.swift b/Sources/_StringProcessing/Engine/MECapture.swift index 390af7d66..807598637 100644 --- a/Sources/_StringProcessing/Engine/MECapture.swift +++ b/Sources/_StringProcessing/Engine/MECapture.swift @@ -145,6 +145,7 @@ extension Processor._StoredCapture: CustomStringConvertible { struct CaptureList { var values: Array._StoredCapture> var referencedCaptureOffsets: [ReferenceID: Int] + var namedCaptureOffsets: [String: Int] // func extract(from s: String) -> Array> { // caps.map { $0.map { s[$0] } } diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift index b0f2e6a79..0bfa0ecba 100644 --- a/Sources/_StringProcessing/Engine/MEProgram.swift +++ b/Sources/_StringProcessing/Engine/MEProgram.swift @@ -36,6 +36,7 @@ struct MEProgram where Input.Element: Equatable { let captureStructure: CaptureStructure let referencedCaptureOffsets: [ReferenceID: Int] + let namedCaptureOffsets: [String: Int] } extension MEProgram: CustomStringConvertible { diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index c7d4527a5..6ebb93f5c 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -37,7 +37,8 @@ struct Executor { let capList = CaptureList( values: cpu.storedCaptures, - referencedCaptureOffsets: engine.program.referencedCaptureOffsets) + referencedCaptureOffsets: engine.program.referencedCaptureOffsets, + namedCaptureOffsets: engine.program.namedCaptureOffsets) let capStruct = engine.program.captureStructure let range = inputRange.lowerBound.. Substring { input[range] } + + public subscript(name: String) -> AnyRegexOutput.Element? 
{ + namedCaptureOffsets[name].map { self[$0 + 1] } + } } /// A type-erased regex output @available(SwiftStdlib 5.7, *) public struct AnyRegexOutput { let input: String + let namedCaptureOffsets: [String: Int] fileprivate let _elements: [ElementRepresentation] /// The underlying representation of the element of a type-erased regex @@ -94,9 +99,12 @@ extension AnyRegexOutput { @available(SwiftStdlib 5.7, *) extension AnyRegexOutput { internal init( - input: String, elements: C + input: String, namedCaptureOffsets: [String: Int], elements: C ) where C.Element == StructuredCapture { - self.init(input: input, _elements: elements.map(ElementRepresentation.init)) + self.init( + input: input, + namedCaptureOffsets: namedCaptureOffsets, + _elements: elements.map(ElementRepresentation.init)) } } @@ -170,6 +178,13 @@ extension AnyRegexOutput: RandomAccessCollection { } } +@available(SwiftStdlib 5.7, *) +extension AnyRegexOutput { + public subscript(name: String) -> Element? { + namedCaptureOffsets[name].map { self[$0 + 1] } + } +} + @available(SwiftStdlib 5.7, *) extension Regex.Match where Output == AnyRegexOutput { /// Creates a type-erased regex match from an existing match. diff --git a/Sources/_StringProcessing/Regex/Match.swift b/Sources/_StringProcessing/Regex/Match.swift index a86899041..4b2f117e4 100644 --- a/Sources/_StringProcessing/Regex/Match.swift +++ b/Sources/_StringProcessing/Regex/Match.swift @@ -26,6 +26,8 @@ extension Regex { let referencedCaptureOffsets: [ReferenceID: Int] + let namedCaptureOffsets: [String: Int] + let value: Any? } } @@ -40,6 +42,7 @@ extension Regex.Match { storedCapture: StoredCapture(range: range, value: nil)) let output = AnyRegexOutput( input: input, + namedCaptureOffsets: namedCaptureOffsets, elements: [wholeMatchAsCapture] + rawCaptures) return output as! 
Output } else if Output.self == Substring.self { diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 897bca8f7..58f847f32 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -689,7 +689,9 @@ class RegexDSLTests: XCTestCase { } do { let regex = try Regex( - compiling: #"([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+).*"#) + compiling: #""" + (?[0-9A-F]+)(?:\.\.(?[0-9A-F]+))?\s+;\s+(?\w+).* + """#) let line = """ A6F0..A6F1 ; Extend # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM \ COMBINING MARK TUKWENTIS @@ -699,13 +701,16 @@ class RegexDSLTests: XCTestCase { let output = match.output XCTAssertEqual(output[0].substring, line[...]) XCTAssertTrue(output[1].substring == "A6F0") + XCTAssertTrue(output["lower"]?.substring == "A6F0") XCTAssertTrue(output[2].substring == "A6F1") + XCTAssertTrue(output["upper"]?.substring == "A6F1") XCTAssertTrue(output[3].substring == "Extend") + XCTAssertTrue(output["desc"]?.substring == "Extend") let typedOutput = try XCTUnwrap(output.as( - (Substring, Substring, Substring?, Substring).self)) + (Substring, lower: Substring, upper: Substring?, Substring).self)) XCTAssertEqual(typedOutput.0, line[...]) - XCTAssertTrue(typedOutput.1 == "A6F0") - XCTAssertTrue(typedOutput.2 == "A6F1") + XCTAssertTrue(typedOutput.lower == "A6F0") + XCTAssertTrue(typedOutput.upper == "A6F1") XCTAssertTrue(typedOutput.3 == "Extend") } } From 182da3bb462055f50e9b67aae626d43fe70025a6 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 19 Apr 2022 11:33:11 -0500 Subject: [PATCH 05/18] Untangle `_RegexParser` from `RegexBuilder` (#299) This makes the changes necessary for _RegexParser to be imported as an implementation-only dependency. The change provides _StringProcessing wrappers for all `AST` types that need to be publicly visible via SPI, and a DSLTree.Node wrapper for internal conformance to _TreeNode. 
Co-authored-by: Richard Wei --- Sources/RegexBuilder/Anchor.swift | 37 +- Sources/RegexBuilder/CharacterClass.swift | 61 +-- Sources/RegexBuilder/DSL.swift | 12 +- Sources/RegexBuilder/Variadics.swift | 45 ++- .../VariadicsGenerator.swift | 5 +- Sources/_StringProcessing/ByteCodeGen.swift | 20 +- .../_StringProcessing/ConsumerInterface.swift | 2 +- .../_StringProcessing/PrintAsPattern.swift | 17 +- .../Regex/ASTConversion.swift | 20 +- .../Regex/DSLConsumers.swift | 3 +- Sources/_StringProcessing/Regex/DSLTree.swift | 373 ++++++++++++++---- Sources/_StringProcessing/Regex/Options.swift | 2 +- .../_CharacterClassModel.swift | 26 +- Tests/RegexBuilderTests/CustomTests.swift | 15 +- 14 files changed, 399 insertions(+), 239 deletions(-) diff --git a/Sources/RegexBuilder/Anchor.swift b/Sources/RegexBuilder/Anchor.swift index 55b554aea..e8cd4ac54 100644 --- a/Sources/RegexBuilder/Anchor.swift +++ b/Sources/RegexBuilder/Anchor.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _RegexParser +@_implementationOnly import _RegexParser @_spi(RegexBuilder) import _StringProcessing @available(SwiftStdlib 5.7, *) @@ -31,34 +31,21 @@ public struct Anchor { @available(SwiftStdlib 5.7, *) extension Anchor: RegexComponent { - var astAssertion: AST.Atom.AssertionKind { - if !isInverted { - switch kind { - case .startOfSubject: return .startOfSubject - case .endOfSubjectBeforeNewline: return .endOfSubjectBeforeNewline - case .endOfSubject: return .endOfSubject - case .firstMatchingPositionInSubject: return .firstMatchingPositionInSubject - case .textSegmentBoundary: return .textSegment - case .startOfLine: return .startOfLine - case .endOfLine: return .endOfLine - case .wordBoundary: return .wordBoundary - } - } else { - switch kind { - case .startOfSubject: fatalError("Not yet supported") - case .endOfSubjectBeforeNewline: fatalError("Not yet supported") - case .endOfSubject: fatalError("Not yet supported") - case 
.firstMatchingPositionInSubject: fatalError("Not yet supported") - case .textSegmentBoundary: return .notTextSegment - case .startOfLine: fatalError("Not yet supported") - case .endOfLine: fatalError("Not yet supported") - case .wordBoundary: return .notWordBoundary - } + var baseAssertion: DSLTree._AST.AssertionKind { + switch kind { + case .startOfSubject: return .startOfSubject(isInverted) + case .endOfSubjectBeforeNewline: return .endOfSubjectBeforeNewline(isInverted) + case .endOfSubject: return .endOfSubject(isInverted) + case .firstMatchingPositionInSubject: return .firstMatchingPositionInSubject(isInverted) + case .textSegmentBoundary: return .textSegmentBoundary(isInverted) + case .startOfLine: return .startOfLine(isInverted) + case .endOfLine: return .endOfLine(isInverted) + case .wordBoundary: return .wordBoundary(isInverted) } } public var regex: Regex { - Regex(node: .atom(.assertion(astAssertion))) + Regex(node: .atom(.assertion(baseAssertion))) } } diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index d163c336b..0087d734a 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _RegexParser +@_implementationOnly import _RegexParser @_spi(RegexBuilder) import _StringProcessing @available(SwiftStdlib 5.7, *) @@ -21,19 +21,10 @@ public struct CharacterClass { } init(unconverted model: _CharacterClassModel) { - // FIXME: Implement in DSLTree instead of wrapping an AST atom - switch model.makeAST() { - case .atom(let atom): - self.ccc = .init(members: [.atom(.unconverted(atom))]) - default: - fatalError("Unsupported _CharacterClassModel") + guard let ccc = model.makeDSLTreeCharacterClass() else { + fatalError("Unsupported character class") } - } - - init(property: AST.Atom.CharacterProperty) { - // FIXME: Implement in DSLTree instead of wrapping an AST 
atom - let astAtom = AST.Atom(.property(property), .fake) - self.ccc = .init(members: [.atom(.unconverted(astAtom))]) + self.ccc = ccc } } @@ -119,11 +110,7 @@ extension RegexComponent where Self == CharacterClass { @available(SwiftStdlib 5.7, *) extension CharacterClass { public static func generalCategory(_ category: Unicode.GeneralCategory) -> CharacterClass { - guard let extendedCategory = category.extendedGeneralCategory else { - fatalError("Unexpected general category") - } - return CharacterClass(property: - .init(.generalCategory(extendedCategory), isInverted: false, isPOSIX: false)) + return CharacterClass(.generalCategory(category)) } } @@ -144,44 +131,6 @@ public func ...(lhs: UnicodeScalar, rhs: UnicodeScalar) -> CharacterClass { return CharacterClass(ccc) } -extension Unicode.GeneralCategory { - var extendedGeneralCategory: Unicode.ExtendedGeneralCategory? { - switch self { - case .uppercaseLetter: return .uppercaseLetter - case .lowercaseLetter: return .lowercaseLetter - case .titlecaseLetter: return .titlecaseLetter - case .modifierLetter: return .modifierLetter - case .otherLetter: return .otherLetter - case .nonspacingMark: return .nonspacingMark - case .spacingMark: return .spacingMark - case .enclosingMark: return .enclosingMark - case .decimalNumber: return .decimalNumber - case .letterNumber: return .letterNumber - case .otherNumber: return .otherNumber - case .connectorPunctuation: return .connectorPunctuation - case .dashPunctuation: return .dashPunctuation - case .openPunctuation: return .openPunctuation - case .closePunctuation: return .closePunctuation - case .initialPunctuation: return .initialPunctuation - case .finalPunctuation: return .finalPunctuation - case .otherPunctuation: return .otherPunctuation - case .mathSymbol: return .mathSymbol - case .currencySymbol: return .currencySymbol - case .modifierSymbol: return .modifierSymbol - case .otherSymbol: return .otherSymbol - case .spaceSeparator: return .spaceSeparator - case 
.lineSeparator: return .lineSeparator - case .paragraphSeparator: return .paragraphSeparator - case .control: return .control - case .format: return .format - case .surrogate: return .surrogate - case .privateUse: return .privateUse - case .unassigned: return .unassigned - @unknown default: return nil - } - } -} - // MARK: - Set algebra methods @available(SwiftStdlib 5.7, *) diff --git a/Sources/RegexBuilder/DSL.swift b/Sources/RegexBuilder/DSL.swift index 86ec0bee5..97bc35154 100644 --- a/Sources/RegexBuilder/DSL.swift +++ b/Sources/RegexBuilder/DSL.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -import _RegexParser +@_implementationOnly import _RegexParser @_spi(RegexBuilder) import _StringProcessing @available(SwiftStdlib 5.7, *) @@ -105,7 +105,7 @@ public struct QuantificationBehavior { var kind: Kind - internal var astKind: AST.Quantification.Kind { + internal var astKind: DSLTree._AST.QuantificationKind { switch kind { case .eagerly: return .eager case .reluctantly: return .reluctant @@ -136,13 +136,13 @@ extension DSLTree.Node { return .quantification(.oneOrMore, kind, node) case _ where range.count == 1: // ..<1 or ...0 or any range with count == 1 // Note: `behavior` is ignored in this case - return .quantification(.exactly(.init(faking: range.lowerBound)), .default, node) + return .quantification(.exactly(range.lowerBound), .default, node) case (0, _): // 0.. 
0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component.regex.root)) + self.init(node: .quantification(.exactly(count), .default, component.regex.root)) } @available(SwiftStdlib 5.7, *) @@ -720,7 +719,7 @@ extension Repeat { ) where RegexOutput == Substring { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component().regex.root)) + self.init(node: .quantification(.exactly(count), .default, component().regex.root)) } @available(SwiftStdlib 5.7, *) @@ -835,7 +834,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?), Component.RegexOutput == (W, C0) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component.regex.root)) + self.init(node: .quantification(.exactly(count), .default, component.regex.root)) } @available(SwiftStdlib 5.7, *) @@ -845,7 +844,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?), Component.RegexOutput == (W, C0) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component().regex.root)) + self.init(node: .quantification(.exactly(count), .default, component().regex.root)) } @available(SwiftStdlib 5.7, *) @@ -958,7 +957,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?, C1?), Component.RegexOutput == (W, C0, C1) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: 
.quantification(.exactly(.init(faking: count)), .default, component.regex.root)) + self.init(node: .quantification(.exactly(count), .default, component.regex.root)) } @available(SwiftStdlib 5.7, *) @@ -968,7 +967,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?, C1?), Component.RegexOutput == (W, C0, C1) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component().regex.root)) + self.init(node: .quantification(.exactly(count), .default, component().regex.root)) } @available(SwiftStdlib 5.7, *) @@ -1081,7 +1080,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?, C1?, C2?), Component.RegexOutput == (W, C0, C1, C2) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component.regex.root)) + self.init(node: .quantification(.exactly(count), .default, component.regex.root)) } @available(SwiftStdlib 5.7, *) @@ -1091,7 +1090,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?, C1?, C2?), Component.RegexOutput == (W, C0, C1, C2) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component().regex.root)) + self.init(node: .quantification(.exactly(count), .default, component().regex.root)) } @available(SwiftStdlib 5.7, *) @@ -1204,7 +1203,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?), Component.RegexOutput == (W, C0, C1, C2, C3) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, 
component.regex.root)) + self.init(node: .quantification(.exactly(count), .default, component.regex.root)) } @available(SwiftStdlib 5.7, *) @@ -1214,7 +1213,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?), Component.RegexOutput == (W, C0, C1, C2, C3) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component().regex.root)) + self.init(node: .quantification(.exactly(count), .default, component().regex.root)) } @available(SwiftStdlib 5.7, *) @@ -1327,7 +1326,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?), Component.RegexOutput == (W, C0, C1, C2, C3, C4) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component.regex.root)) + self.init(node: .quantification(.exactly(count), .default, component.regex.root)) } @available(SwiftStdlib 5.7, *) @@ -1337,7 +1336,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?), Component.RegexOutput == (W, C0, C1, C2, C3, C4) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component().regex.root)) + self.init(node: .quantification(.exactly(count), .default, component().regex.root)) } @available(SwiftStdlib 5.7, *) @@ -1450,7 +1449,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: 
count)), .default, component.regex.root)) + self.init(node: .quantification(.exactly(count), .default, component.regex.root)) } @available(SwiftStdlib 5.7, *) @@ -1460,7 +1459,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component().regex.root)) + self.init(node: .quantification(.exactly(count), .default, component().regex.root)) } @available(SwiftStdlib 5.7, *) @@ -1573,7 +1572,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component.regex.root)) + self.init(node: .quantification(.exactly(count), .default, component.regex.root)) } @available(SwiftStdlib 5.7, *) @@ -1583,7 +1582,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component().regex.root)) + self.init(node: .quantification(.exactly(count), .default, component().regex.root)) } @available(SwiftStdlib 5.7, *) @@ -1696,7 +1695,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` 
or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component.regex.root)) + self.init(node: .quantification(.exactly(count), .default, component.regex.root)) } @available(SwiftStdlib 5.7, *) @@ -1706,7 +1705,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component().regex.root)) + self.init(node: .quantification(.exactly(count), .default, component().regex.root)) } @available(SwiftStdlib 5.7, *) @@ -1819,7 +1818,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?, C8?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component.regex.root)) + self.init(node: .quantification(.exactly(count), .default, component.regex.root)) } @available(SwiftStdlib 5.7, *) @@ -1829,7 +1828,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?, C8?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component().regex.root)) + self.init(node: .quantification(.exactly(count), .default, component().regex.root)) } @available(SwiftStdlib 5.7, *) @@ -1942,7 +1941,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?, C8?, C9?), 
Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component.regex.root)) + self.init(node: .quantification(.exactly(count), .default, component.regex.root)) } @available(SwiftStdlib 5.7, *) @@ -1952,7 +1951,7 @@ extension Repeat { ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?, C8?, C9?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component().regex.root)) + self.init(node: .quantification(.exactly(count), .default, component().regex.root)) } @available(SwiftStdlib 5.7, *) diff --git a/Sources/VariadicsGenerator/VariadicsGenerator.swift b/Sources/VariadicsGenerator/VariadicsGenerator.swift index d1cb41810..50f09700e 100644 --- a/Sources/VariadicsGenerator/VariadicsGenerator.swift +++ b/Sources/VariadicsGenerator/VariadicsGenerator.swift @@ -121,7 +121,6 @@ struct VariadicsGenerator: ParsableCommand { // BEGIN AUTO-GENERATED CONTENT - import _RegexParser @_spi(RegexBuilder) import _StringProcessing @@ -490,7 +489,7 @@ struct VariadicsGenerator: ParsableCommand { ) \(params.whereClauseForInit) { assert(count > 0, "Must specify a positive count") // TODO: Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component.regex.root)) + self.init(node: .quantification(.exactly(count), .default, component.regex.root)) } \(defaultAvailableAttr) @@ -501,7 +500,7 @@ struct VariadicsGenerator: ParsableCommand { ) \(params.whereClauseForInit) { assert(count > 0, "Must specify a positive count") // TODO: 
Emit a warning about `repeatMatch(count: 0)` or `repeatMatch(count: 1)` - self.init(node: .quantification(.exactly(.init(faking: count)), .default, component().regex.root)) + self.init(node: .quantification(.exactly(count), .default, component().regex.root)) } \(defaultAvailableAttr) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 86309bb8a..621538ebe 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -26,22 +26,22 @@ extension Compiler.ByteCodeGen { try emitScalar(s) case let .assertion(kind): - try emitAssertion(kind) + try emitAssertion(kind.ast) case let .backreference(ref): - try emitBackreference(ref) + try emitBackreference(ref.ast) case let .symbolicReference(id): builder.buildUnresolvedReference(id: id) case let .changeMatchingOptions(optionSequence): - options.apply(optionSequence) + options.apply(optionSequence.ast) case let .unconverted(astAtom): - if let consumer = try astAtom.generateConsumer(options) { + if let consumer = try astAtom.ast.generateConsumer(options) { builder.buildConsume(by: consumer) } else { - throw Unsupported("\(astAtom._patternBase)") + throw Unsupported("\(astAtom.ast._patternBase)") } } } @@ -370,9 +370,9 @@ extension Compiler.ByteCodeGen { let updatedKind: AST.Quantification.Kind switch kind { case .explicit(let kind): - updatedKind = kind + updatedKind = kind.ast case .syntax(let kind): - updatedKind = kind.applying(options) + updatedKind = kind.ast.applying(options) case .default: updatedKind = options.isReluctantByDefault ? 
.reluctant @@ -604,13 +604,13 @@ extension Compiler.ByteCodeGen { } case let .nonCapturingGroup(kind, child): - try emitNoncapturingGroup(kind, child) + try emitNoncapturingGroup(kind.ast, child) case .conditional: throw Unsupported("Conditionals") case let .quantification(amt, kind, child): - try emitQuantification(amt, kind, child) + try emitQuantification(amt.ast, kind, child) case let .customCharacterClass(ccc): if ccc.containsAny { @@ -646,7 +646,7 @@ extension Compiler.ByteCodeGen { } case let .regexLiteral(l): - try emitNode(l.dslTreeNode) + try emitNode(l.ast.dslTreeNode) case let .convertedRegexLiteral(n, _): try emitNode(n) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index f77cd322f..356b7cc4b 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -105,7 +105,7 @@ extension DSLTree.Atom { return nil case let .unconverted(a): - return try a.generateConsumer(opts) + return try a.ast.generateConsumer(opts) } } diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 4d135898b..91626eb5c 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -68,7 +68,7 @@ extension PrettyPrinter { private mutating func printAsPattern( convertedFromAST node: DSLTree.Node ) { - if patternBackoff(node) { + if patternBackoff(DSLTree._Tree(node)) { printBackoff(node) return } @@ -90,7 +90,7 @@ extension PrettyPrinter { } case let .nonCapturingGroup(kind, child): - let kind = kind._patternBase + let kind = kind.ast._patternBase printBlock("Group(\(kind))") { printer in printer.printAsPattern(convertedFromAST: child) } @@ -108,8 +108,8 @@ extension PrettyPrinter { print("/* TODO: conditional */") case let .quantification(amount, kind, child): - let amount = amount._patternBase - let kind = kind._patternBase + let amount = 
amount.ast._patternBase + let kind = (kind.ast ?? .eager)._patternBase printBlock("\(amount)(\(kind))") { printer in printer.printAsPattern(convertedFromAST: child) } @@ -129,7 +129,7 @@ extension PrettyPrinter { case let .unconverted(a): // TODO: is this always right? // TODO: Convert built-in character classes - print(a._patternBase) + print(a.ast._patternBase) case .assertion: print("/* TODO: assertions */") @@ -400,11 +400,6 @@ extension AST.Quantification.Kind { extension DSLTree.QuantificationKind { var _patternBase: String { - switch self { - case .explicit(let kind), .syntax(let kind): - return kind._patternBase - case .default: - return ".eager" - } + (ast ?? .eager)._patternBase } } diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 8acbd3b1b..ef98a7b8f 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -40,7 +40,7 @@ extension AST.Node { // TODO: Should we do this for the // single-concatenation child too, or should? // we wrap _that_? 
- return .convertedRegexLiteral(node, self) + return .convertedRegexLiteral(node, .init(ast: self)) } // Convert the top-level node without wrapping @@ -111,19 +111,19 @@ extension AST.Node { case .balancedCapture: throw Unsupported("TODO: balanced captures") default: - return .nonCapturingGroup(v.kind.value, child) + return .nonCapturingGroup(.init(ast: v.kind.value), child) } case let .conditional(v): let trueBranch = v.trueBranch.dslTreeNode let falseBranch = v.falseBranch.dslTreeNode return .conditional( - v.condition.kind, trueBranch, falseBranch) + .init(ast: v.condition.kind), trueBranch, falseBranch) case let .quantification(v): let child = v.child.dslTreeNode return .quantification( - v.amount.value, .syntax(v.kind.value), child) + .init(ast: v.amount.value), .syntax(.init(ast: v.kind.value)), child) case let .quote(v): return .quotedLiteral(v.literal) @@ -140,9 +140,9 @@ extension AST.Node { case .empty(_): return .empty - case let .absentFunction(a): + case let .absentFunction(abs): // TODO: What should this map to? - return .absentFunction(a) + return .absentFunction(.init(ast: abs)) } } @@ -202,20 +202,20 @@ extension AST.CustomCharacterClass { extension AST.Atom { var dslTreeAtom: DSLTree.Atom { if let kind = assertionKind { - return .assertion(kind) + return .assertion(.init(ast: kind)) } switch self.kind { case let .char(c): return .char(c) case let .scalar(s): return .scalar(s) case .any: return .any - case let .backreference(r): return .backreference(r) - case let .changeMatchingOptions(seq): return .changeMatchingOptions(seq) + case let .backreference(r): return .backreference(.init(ast: r)) + case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) case .escaped(let c) where c.scalarValue != nil: return .scalar(c.scalarValue!) 
- default: return .unconverted(self) + default: return .unconverted(.init(ast: self)) } } } diff --git a/Sources/_StringProcessing/Regex/DSLConsumers.swift b/Sources/_StringProcessing/Regex/DSLConsumers.swift index ea46c789b..eb8ace8d3 100644 --- a/Sources/_StringProcessing/Regex/DSLConsumers.swift +++ b/Sources/_StringProcessing/Regex/DSLConsumers.swift @@ -21,8 +21,7 @@ public protocol CustomMatchingRegexComponent: RegexComponent { @available(SwiftStdlib 5.7, *) extension CustomMatchingRegexComponent { public var regex: Regex { - - let node: DSLTree.Node = .matcher(.init(RegexOutput.self), { input, index, bounds in + let node: DSLTree.Node = .matcher(RegexOutput.self, { input, index, bounds in try match(input, startingAt: index, in: bounds) }) return Regex(node: node) diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 51f5ea36f..ce5beeaca 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -24,7 +24,7 @@ public struct DSLTree { extension DSLTree { @_spi(RegexBuilder) - public indirect enum Node: _TreeNode { + public indirect enum Node { /// Try to match each node in order /// /// ... | ... | ... @@ -42,7 +42,7 @@ extension DSLTree { name: String? = nil, reference: ReferenceID? 
= nil, Node) /// Match a (non-capturing) subpattern / group - case nonCapturingGroup(AST.Group.Kind, Node) + case nonCapturingGroup(_AST.GroupKind, Node) // TODO: Consider splitting off grouped conditions, or have // our own kind @@ -52,10 +52,10 @@ extension DSLTree { /// (?(cond) true-branch | false-branch) /// case conditional( - AST.Conditional.Condition.Kind, Node, Node) + _AST.ConditionKind, Node, Node) case quantification( - AST.Quantification.Amount, + _AST.QuantificationAmount, QuantificationKind, Node) @@ -74,19 +74,19 @@ extension DSLTree { case quotedLiteral(String) /// An embedded literal - case regexLiteral(AST.Node) + case regexLiteral(_AST.ASTNode) // TODO: What should we do here? /// /// TODO: Consider splitting off expression functions, or have our own kind - case absentFunction(AST.AbsentFunction) + case absentFunction(_AST.AbsentFunction) // MARK: - Tree conversions /// The target of AST conversion. /// /// Keeps original AST around for rich syntactic and source information - case convertedRegexLiteral(Node, AST.Node) + case convertedRegexLiteral(Node, _AST.ASTNode) // MARK: - Extensibility points @@ -95,7 +95,7 @@ extension DSLTree { case consumer(_ConsumerInterface) - case matcher(AnyType, _MatcherInterface) + case matcher(Any.Type, _MatcherInterface) // TODO: Would this just boil down to a consumer? case characterPredicate(_CharacterPredicateInterface) @@ -108,9 +108,17 @@ extension DSLTree { /// The default quantification kind, as set by options. case `default` /// An explicitly chosen kind, overriding any options. - case explicit(AST.Quantification.Kind) + case explicit(_AST.QuantificationKind) /// A kind set via syntax, which can be affected by options. - case syntax(AST.Quantification.Kind) + case syntax(_AST.QuantificationKind) + + var ast: AST.Quantification.Kind? 
{ + switch self { + case .default: return nil + case .explicit(let kind), .syntax(let kind): + return kind.ast + } + } } @_spi(RegexBuilder) @@ -134,6 +142,12 @@ extension DSLTree { self.isInverted = isInverted } + public static func generalCategory(_ category: Unicode.GeneralCategory) -> Self { + let property = AST.Atom.CharacterProperty(.generalCategory(category.extendedGeneralCategory!), isInverted: false, isPOSIX: false) + let astAtom = AST.Atom(.property(property), .fake) + return .init(members: [.atom(.unconverted(.init(ast: astAtom)))]) + } + public var inverted: CustomCharacterClass { var result = self result.isInverted.toggle() @@ -162,13 +176,51 @@ extension DSLTree { case scalar(Unicode.Scalar) case any - case assertion(AST.Atom.AssertionKind) - case backreference(AST.Reference) + case assertion(_AST.AssertionKind) + case backreference(_AST.Reference) case symbolicReference(ReferenceID) - case changeMatchingOptions(AST.MatchingOptionSequence) + case changeMatchingOptions(_AST.MatchingOptionSequence) + + case unconverted(_AST.Atom) + } +} - case unconverted(AST.Atom) +extension Unicode.GeneralCategory { + var extendedGeneralCategory: Unicode.ExtendedGeneralCategory? 
{ + switch self { + case .uppercaseLetter: return .uppercaseLetter + case .lowercaseLetter: return .lowercaseLetter + case .titlecaseLetter: return .titlecaseLetter + case .modifierLetter: return .modifierLetter + case .otherLetter: return .otherLetter + case .nonspacingMark: return .nonspacingMark + case .spacingMark: return .spacingMark + case .enclosingMark: return .enclosingMark + case .decimalNumber: return .decimalNumber + case .letterNumber: return .letterNumber + case .otherNumber: return .otherNumber + case .connectorPunctuation: return .connectorPunctuation + case .dashPunctuation: return .dashPunctuation + case .openPunctuation: return .openPunctuation + case .closePunctuation: return .closePunctuation + case .initialPunctuation: return .initialPunctuation + case .finalPunctuation: return .finalPunctuation + case .otherPunctuation: return .otherPunctuation + case .mathSymbol: return .mathSymbol + case .currencySymbol: return .currencySymbol + case .modifierSymbol: return .modifierSymbol + case .otherSymbol: return .otherSymbol + case .spaceSeparator: return .spaceSeparator + case .lineSeparator: return .lineSeparator + case .paragraphSeparator: return .paragraphSeparator + case .control: return .control + case .format: return .format + case .surrogate: return .surrogate + case .privateUse: return .privateUse + case .unassigned: return .unassigned + @unknown default: return nil + } } } @@ -226,8 +278,8 @@ extension DSLTree.Node { .customCharacterClass, .atom: return [] - case let .absentFunction(a): - return a.children.map(\.dslTreeNode) + case let .absentFunction(abs): + return abs.ast.children.map(\.dslTreeNode) } } } @@ -235,8 +287,8 @@ extension DSLTree.Node { extension DSLTree.Node { var astNode: AST.Node? 
{ switch self { - case let .regexLiteral(ast): return ast - case let .convertedRegexLiteral(_, ast): return ast + case let .regexLiteral(literal): return literal.ast + case let .convertedRegexLiteral(_, literal): return literal.ast default: return nil } } @@ -280,9 +332,9 @@ extension DSLTree.Node { case .capture: return true case let .regexLiteral(re): - return re.hasCapture + return re.ast.hasCapture case let .convertedRegexLiteral(n, re): - assert(n.hasCapture == re.hasCapture) + assert(n.hasCapture == re.ast.hasCapture) return n.hasCapture default: @@ -295,70 +347,15 @@ extension DSLTree { var captureStructure: CaptureStructure { // TODO: nesting var constructor = CaptureStructure.Constructor(.flatten) - return root._captureStructure(&constructor) + return _Tree(root)._captureStructure(&constructor) } } extension DSLTree.Node { - @_spi(RegexBuilder) - public func _captureStructure( - _ constructor: inout CaptureStructure.Constructor - ) -> CaptureStructure { - switch self { - case let .orderedChoice(children): - return constructor.alternating(children) - - case let .concatenation(children): - return constructor.concatenating(children) - - case let .capture(name, _, child): - if let type = child.valueCaptureType { - return constructor.capturing( - name: name, child, withType: type) - } - return constructor.capturing(name: name, child) - - case let .nonCapturingGroup(kind, child): - assert(!kind.isCapturing) - return constructor.grouping(child, as: kind) - - case let .conditional(cond, trueBranch, falseBranch): - return constructor.condition( - cond, - trueBranch: trueBranch, - falseBranch: falseBranch) - - case let .quantification(amount, _, child): - return constructor.quantifying( - child, amount: amount) - - case let .regexLiteral(re): - // TODO: Force a re-nesting? - return re._captureStructure(&constructor) - - case let .absentFunction(abs): - return constructor.absent(abs.kind) - - case let .convertedRegexLiteral(n, _): - // TODO: Switch nesting strategy? 
- return n._captureStructure(&constructor) - - case .matcher: - return .empty - - case .transform(_, let child): - return child._captureStructure(&constructor) - - case .customCharacterClass, .atom, .trivia, .empty, - .quotedLiteral, .consumer, .characterPredicate: - return .empty - } - } - /// For typed capture-producing nodes, the type produced. var valueCaptureType: AnyType? { switch self { case let .matcher(t, _): - return t + return AnyType(t) case let .transform(t, _): return AnyType(t.resultType) default: return nil @@ -455,3 +452,225 @@ public struct CaptureTransform: Hashable, CustomStringConvertible { "" } } + +// MARK: AST wrapper types +// +// These wrapper types are required because even @_spi-marked public APIs can't +// include symbols from implementation-only dependencies. + +extension DSLTree { + /// Presents a wrapped version of `DSLTree.Node` that can provide an internal + /// `_TreeNode` conformance. + struct _Tree: _TreeNode { + var node: DSLTree.Node + + init(_ node: DSLTree.Node) { + self.node = node + } + + var children: [_Tree]? 
{ + switch node { + + case let .orderedChoice(v): return v.map(_Tree.init) + case let .concatenation(v): return v.map(_Tree.init) + + case let .convertedRegexLiteral(n, _): + // Treat this transparently + return _Tree(n).children + + case let .capture(_, _, n): return [_Tree(n)] + case let .nonCapturingGroup(_, n): return [_Tree(n)] + case let .transform(_, n): return [_Tree(n)] + case let .quantification(_, _, n): return [_Tree(n)] + + case let .conditional(_, t, f): return [_Tree(t), _Tree(f)] + + case .trivia, .empty, .quotedLiteral, .regexLiteral, + .consumer, .matcher, .characterPredicate, + .customCharacterClass, .atom: + return [] + + case let .absentFunction(abs): + return abs.ast.children.map(\.dslTreeNode).map(_Tree.init) + } + } + + func _captureStructure( + _ constructor: inout CaptureStructure.Constructor + ) -> CaptureStructure { + switch node { + case let .orderedChoice(children): + return constructor.alternating(children.map(_Tree.init)) + + case let .concatenation(children): + return constructor.concatenating(children.map(_Tree.init)) + + case let .capture(name, _, child): + if let type = child.valueCaptureType { + return constructor.capturing( + name: name, _Tree(child), withType: type) + } + return constructor.capturing(name: name, _Tree(child)) + + case let .nonCapturingGroup(kind, child): + assert(!kind.ast.isCapturing) + return constructor.grouping(_Tree(child), as: kind.ast) + + case let .conditional(cond, trueBranch, falseBranch): + return constructor.condition( + cond.ast, + trueBranch: _Tree(trueBranch), + falseBranch: _Tree(falseBranch)) + + case let .quantification(amount, _, child): + return constructor.quantifying( + Self(child), amount: amount.ast) + + case let .regexLiteral(re): + // TODO: Force a re-nesting? + return re.ast._captureStructure(&constructor) + + case let .absentFunction(abs): + return constructor.absent(abs.ast.kind) + + case let .convertedRegexLiteral(n, _): + // TODO: Switch nesting strategy? 
+ return Self(n)._captureStructure(&constructor) + + case .matcher: + return .empty + + case .transform(_, let child): + return Self(child)._captureStructure(&constructor) + + case .customCharacterClass, .atom, .trivia, .empty, + .quotedLiteral, .consumer, .characterPredicate: + return .empty + } + } + } + + @_spi(RegexBuilder) + public enum _AST { + @_spi(RegexBuilder) + public struct GroupKind { + internal var ast: AST.Group.Kind + + public static var atomicNonCapturing: Self { + .init(ast: .atomicNonCapturing) + } + public static var lookahead: Self { + .init(ast: .lookahead) + } + public static var negativeLookahead: Self { + .init(ast: .negativeLookahead) + } + } + + @_spi(RegexBuilder) + public struct ConditionKind { + internal var ast: AST.Conditional.Condition.Kind + } + + @_spi(RegexBuilder) + public struct QuantificationKind { + internal var ast: AST.Quantification.Kind + + public static var eager: Self { + .init(ast: .eager) + } + public static var reluctant: Self { + .init(ast: .reluctant) + } + public static var possessive: Self { + .init(ast: .possessive) + } + } + + @_spi(RegexBuilder) + public struct QuantificationAmount { + internal var ast: AST.Quantification.Amount + + public static var zeroOrMore: Self { + .init(ast: .zeroOrMore) + } + public static var oneOrMore: Self { + .init(ast: .oneOrMore) + } + public static var zeroOrOne: Self { + .init(ast: .zeroOrOne) + } + public static func exactly(_ n: Int) -> Self { + .init(ast: .exactly(.init(faking: n))) + } + public static func nOrMore(_ n: Int) -> Self { + .init(ast: .nOrMore(.init(faking: n))) + } + public static func upToN(_ n: Int) -> Self { + .init(ast: .upToN(.init(faking: n))) + } + public static func range(_ lower: Int, _ upper: Int) -> Self { + .init(ast: .range(.init(faking: lower), .init(faking: upper))) + } + } + + @_spi(RegexBuilder) + public struct ASTNode { + internal var ast: AST.Node + } + + @_spi(RegexBuilder) + public struct AbsentFunction { + internal var ast: 
AST.AbsentFunction + } + + @_spi(RegexBuilder) + public struct AssertionKind { + internal var ast: AST.Atom.AssertionKind + + public static func startOfSubject(_ inverted: Bool = false) -> Self { + .init(ast: .startOfSubject) + } + public static func endOfSubjectBeforeNewline(_ inverted: Bool = false) -> Self { + .init(ast: .endOfSubjectBeforeNewline) + } + public static func endOfSubject(_ inverted: Bool = false) -> Self { + .init(ast: .endOfSubject) + } + public static func firstMatchingPositionInSubject(_ inverted: Bool = false) -> Self { + .init(ast: .firstMatchingPositionInSubject) + } + public static func textSegmentBoundary(_ inverted: Bool = false) -> Self { + inverted + ? .init(ast: .notTextSegment) + : .init(ast: .textSegment) + } + public static func startOfLine(_ inverted: Bool = false) -> Self { + .init(ast: .startOfLine) + } + public static func endOfLine(_ inverted: Bool = false) -> Self { + .init(ast: .endOfLine) + } + public static func wordBoundary(_ inverted: Bool = false) -> Self { + inverted + ? .init(ast: .notWordBoundary) + : .init(ast: .wordBoundary) + } + } + + @_spi(RegexBuilder) + public struct Reference { + internal var ast: AST.Reference + } + + @_spi(RegexBuilder) + public struct MatchingOptionSequence { + internal var ast: AST.MatchingOptionSequence + } + + @_spi(RegexBuilder) + public struct Atom { + internal var ast: AST.Atom + } + } +} diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift index 623589b54..a93421f4f 100644 --- a/Sources/_StringProcessing/Regex/Options.swift +++ b/Sources/_StringProcessing/Regex/Options.swift @@ -195,6 +195,6 @@ extension RegexComponent { ? 
AST.MatchingOptionSequence(adding: [.init(option, location: .fake)]) : AST.MatchingOptionSequence(removing: [.init(option, location: .fake)]) return Regex(node: .nonCapturingGroup( - .changeMatchingOptions(sequence), regex.root)) + .init(ast: .changeMatchingOptions(sequence)), regex.root)) } } diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index c9762f00e..2debcda9d 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -28,7 +28,7 @@ public struct _CharacterClassModel: Hashable { var isInverted: Bool = false // TODO: Split out builtin character classes into their own type? - public enum Representation: Hashable { + public enum Representation: Hashable { /// Any character case any /// Any grapheme cluster @@ -52,10 +52,14 @@ public struct _CharacterClassModel: Hashable { case custom([CharacterSetComponent]) } - public typealias SetOperator = AST.CustomCharacterClass.SetOp + public enum SetOperator: Hashable { + case subtraction + case intersection + case symmetricDifference + } /// A binary set operation that forms a character class component. - public struct SetOperation: Hashable { + public struct SetOperation: Hashable { var lhs: CharacterSetComponent var op: SetOperator var rhs: CharacterSetComponent @@ -72,7 +76,7 @@ public struct _CharacterClassModel: Hashable { } } - public enum CharacterSetComponent: Hashable { + public enum CharacterSetComponent: Hashable { case character(Character) case range(ClosedRange) @@ -294,7 +298,17 @@ extension _CharacterClassModel: CustomStringConvertible { } extension _CharacterClassModel { - public func makeAST() -> AST.Node? { + public func makeDSLTreeCharacterClass() -> DSLTree.CustomCharacterClass? 
{ + // FIXME: Implement in DSLTree instead of wrapping an AST atom + switch makeAST() { + case .atom(let atom): + return .init(members: [.atom(.unconverted(.init(ast: atom)))]) + default: + return nil + } + } + + internal func makeAST() -> AST.Node? { let inv = isInverted func esc(_ b: AST.Atom.EscapedBuiltin) -> AST.Node { @@ -375,7 +389,7 @@ extension DSLTree.Atom { var characterClass: _CharacterClassModel? { switch self { case let .unconverted(a): - return a.characterClass + return a.ast.characterClass default: return nil } diff --git a/Tests/RegexBuilderTests/CustomTests.swift b/Tests/RegexBuilderTests/CustomTests.swift index 0a7d6fc59..bf4489a68 100644 --- a/Tests/RegexBuilderTests/CustomTests.swift +++ b/Tests/RegexBuilderTests/CustomTests.swift @@ -136,7 +136,7 @@ func customTest( class CustomRegexComponentTests: XCTestCase { // TODO: Refactor below into more exhaustive, declarative // tests. - func testCustomRegexComponents() { + func testCustomRegexComponents() throws { customTest( Regex { Numbler() @@ -178,14 +178,13 @@ class CustomRegexComponentTests: XCTestCase { } } - guard let res3 = "ab123c".firstMatch(of: regex3) else { - XCTFail() - return - } + let str = "ab123c" + let res3 = try XCTUnwrap(str.firstMatch(of: regex3)) - XCTAssertEqual(res3.range, "ab123c".index(atOffset: 2)..<"ab123c".index(atOffset: 5)) - XCTAssertEqual(res3.output.0, "123") - XCTAssertEqual(res3.output.1, "123") + let expectedSubstring = str.dropFirst(2).prefix(3) + XCTAssertEqual(res3.range, expectedSubstring.startIndex.. Date: Tue, 19 Apr 2022 13:50:10 -0700 Subject: [PATCH 06/18] Expose `matches`, `ranges` and `split` (#304) * Expose `matches`, `ranges` and `split` Publicize these API per the String Processing Algorithms proposal. The proposed ones return generic `Collection`, empowered by SE-0346. For now we'll wrap the results with a concrete `Array` until the language feature is ready. 
Co-authored-by: Michael Ilseman --- .../Algorithms/Algorithms/Ranges.swift | 29 +++++++++++++-- .../Algorithms/Algorithms/Split.swift | 35 +++++++++++++------ .../Algorithms/Matching/MatchReplace.swift | 2 +- .../Algorithms/Matching/Matches.swift | 15 ++++---- Tests/RegexBuilderTests/AlgorithmsTests.swift | 12 +++---- Tests/RegexTests/AlgorithmsTests.swift | 6 ++-- 6 files changed, 69 insertions(+), 30 deletions(-) diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift index f1861fcf2..853c73271 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift @@ -175,12 +175,24 @@ extension BidirectionalCollection { // MARK: Fixed pattern algorithms extension Collection where Element: Equatable { - // FIXME: Replace `RangesCollection` when SE-0346 is enabled func ranges( of other: S ) -> RangesCollection> where S.Element == Element { ranges(of: ZSearcher(pattern: Array(other), by: ==)) } + + // FIXME: Return `some Collection>` for SE-0346 + /// Finds and returns the ranges of all occurrences of a given sequence + /// within the collection. + /// - Parameter other: The sequence to search for. + /// - Returns: A collection of ranges of all occurrences of `other`. Returns + /// an empty collection if `other` is not found. 
+ @available(SwiftStdlib 5.7, *) + public func ranges( + of other: S + ) -> [Range] where S.Element == Element { + ranges(of: ZSearcher(pattern: Array(other), by: ==)).map { $0 } + } } extension BidirectionalCollection where Element: Equatable { @@ -217,8 +229,8 @@ extension BidirectionalCollection where Element: Comparable { // MARK: Regex algorithms extension BidirectionalCollection where SubSequence == Substring { - // FIXME: Replace `RangesCollection` when SE-0346 is enabled @available(SwiftStdlib 5.7, *) + @_disfavoredOverload func ranges( of regex: R ) -> RangesCollection> { @@ -231,4 +243,17 @@ extension BidirectionalCollection where SubSequence == Substring { ) -> ReversedRangesCollection> { rangesFromBack(of: RegexConsumer(regex)) } + + // FIXME: Return `some Collection>` for SE-0346 + /// Finds and returns the ranges of all occurrences of a given regex + /// within the collection. + /// - Parameter regex: The regex to search for. + /// - Returns: A collection of ranges in the receiver of all occurrences of + /// `regex`. Returns an empty collection if `regex` is not found. 
+ @available(SwiftStdlib 5.7, *) + public func ranges( + of regex: R + ) -> [Range] { + Array(ranges(of: RegexConsumer(regex))) + } } diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift index 485bc3b7f..8c7a9832d 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Split.swift @@ -233,16 +233,24 @@ extension BidirectionalCollection where Element: Equatable { // MARK: Fixed pattern algorithms extension Collection where Element: Equatable { - // FIXME: Replace `SplitCollection` when SE-0346 is enabled + @_disfavoredOverload + func split( + by separator: S + ) -> SplitCollection> where S.Element == Element { + split(by: ZSearcher(pattern: Array(separator), by: ==)) + } + + // FIXME: Return `some Collection` for SE-0346 /// Returns the longest possible subsequences of the collection, in order, /// around elements equal to the given separator. /// - Parameter separator: The element to be split upon. /// - Returns: A collection of subsequences, split from this collection's /// elements. - func split( + @available(SwiftStdlib 5.7, *) + public func split( by separator: S - ) -> SplitCollection> where S.Element == Element { - split(by: ZSearcher(pattern: Array(separator), by: ==)) + ) -> [SubSequence] where S.Element == Element { + Array(split(by: ZSearcher(pattern: Array(separator), by: ==))) } } @@ -282,12 +290,7 @@ extension BidirectionalCollection where Element: Comparable { @available(SwiftStdlib 5.7, *) extension BidirectionalCollection where SubSequence == Substring { - // FIXME: Replace `SplitCollection` when SE-0346 is enabled - /// Returns the longest possible subsequences of the collection, in order, - /// around elements equal to the given separator. - /// - Parameter separator: A regex describing elements to be split upon. 
- /// - Returns: A collection of substrings, split from this collection's - /// elements. + @_disfavoredOverload func split( by separator: R ) -> SplitCollection> { @@ -299,4 +302,16 @@ extension BidirectionalCollection where SubSequence == Substring { ) -> ReversedSplitCollection> { splitFromBack(by: RegexConsumer(separator)) } + + // FIXME: Return `some Collection` for SE-0346 + /// Returns the longest possible subsequences of the collection, in order, + /// around elements equal to the given separator. + /// - Parameter separator: A regex describing elements to be split upon. + /// - Returns: A collection of substrings, split from this collection's + /// elements. + public func split( + by separator: R + ) -> [SubSequence] { + Array(split(by: RegexConsumer(separator))) + } } diff --git a/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift b/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift index 8485182de..09e021a29 100644 --- a/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift +++ b/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift @@ -139,7 +139,7 @@ extension RangeReplaceableCollection where SubSequence == Substring { var result = Self() result.append(contentsOf: self[..( of regex: R ) -> MatchesCollection> { @@ -202,10 +199,14 @@ extension BidirectionalCollection where SubSequence == Substring { matchesFromBack(of: RegexConsumer(regex)) } - // FIXME: Replace the returned value as `some Collection.Match> - // when SE-0346 is enabled + // FIXME: Return `some Collection.Match> for SE-0346 + /// Returns a collection containing all matches of the specified regex. + /// - Parameter regex: The regex to search for. + /// - Returns: A collection of matches of `regex`. @available(SwiftStdlib 5.7, *) - func _matches(of r: R) -> [Regex.Match] { + public func matches( + of r: R + ) -> [Regex.Match] { let slice = self[...] 
var start = self.startIndex let end = self.endIndex diff --git a/Tests/RegexBuilderTests/AlgorithmsTests.swift b/Tests/RegexBuilderTests/AlgorithmsTests.swift index cf117690a..793054cd1 100644 --- a/Tests/RegexBuilderTests/AlgorithmsTests.swift +++ b/Tests/RegexBuilderTests/AlgorithmsTests.swift @@ -15,13 +15,11 @@ import _StringProcessing @available(SwiftStdlib 5.7, *) class RegexConsumerTests: XCTestCase { - // FIXME: enable this test when we update the return type of `matches(of:)` - // when SE-0346 is available - // func testMatches() { - // let regex = Capture(OneOrMore(.digit)) { 2 * Int($0)! } - // let str = "foo 160 bar 99 baz" - // XCTAssertEqual(str.matches(of: regex).map(\.result.1), [320, 198]) - // } + func testMatches() { + let regex = Capture(OneOrMore(.digit)) { 2 * Int($0)! } + let str = "foo 160 bar 99 baz" + XCTAssertEqual(str.matches(of: regex).map(\.output.1), [320, 198]) + } func testMatchReplace() { func replaceTest( diff --git a/Tests/RegexTests/AlgorithmsTests.swift b/Tests/RegexTests/AlgorithmsTests.swift index 8e77a8977..a7832a0f9 100644 --- a/Tests/RegexTests/AlgorithmsTests.swift +++ b/Tests/RegexTests/AlgorithmsTests.swift @@ -163,13 +163,13 @@ class RegexConsumerTests: XCTestCase { XCTAssertEqual(s2.replacing(regex, with: ""), "") XCTAssertEqual( - s._matches(of: regex).map(\.0), + s.matches(of: regex).map(\.0), ["aaa", "aaaaaa", "aaaaaaaaaa"]) XCTAssertEqual( - s1._matches(of: regex).map(\.0), + s1.matches(of: regex).map(\.0), ["aaaaaa", "aaaaaaaaaa"]) XCTAssertEqual( - s2._matches(of: regex).map(\.0), + s2.matches(of: regex).map(\.0), ["aa"]) } } From 15355bfd33d82b4eb5f379ed0cd903f5a5e61fcb Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Tue, 19 Apr 2022 15:51:23 -0600 Subject: [PATCH 07/18] Convenience quoting (#305) --- Sources/_StringProcessing/Regex/Match.swift | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Sources/_StringProcessing/Regex/Match.swift b/Sources/_StringProcessing/Regex/Match.swift index 
4b2f117e4..45b177867 100644 --- a/Sources/_StringProcessing/Regex/Match.swift +++ b/Sources/_StringProcessing/Regex/Match.swift @@ -187,3 +187,10 @@ extension Substring { try? r.regex.prefixMatch(in: self) } } + +@available(SwiftStdlib 5.7, *) +extension Regex { + public init(quoting string: String) { + self.init(node: .quotedLiteral(string)) + } +} From 46b9a0fcee56cbb3040ce29be727db322be76bf2 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Wed, 20 Apr 2022 16:38:21 -0600 Subject: [PATCH 08/18] Remove compiling argument label (#306) --- Documentation/Evolution/RegexLiterals.md | 4 ++-- .../Evolution/RegexSyntaxRunTimeConstruction.md | 4 ++-- Documentation/Evolution/RegexTypeOverview.md | 6 +++--- Sources/Exercises/Participants/RegexParticipant.swift | 2 +- Sources/_StringProcessing/Regex/AnyRegexOutput.swift | 4 ++-- Sources/_StringProcessing/Regex/Core.swift | 2 +- Tests/RegexBuilderTests/MotivationTests.swift | 4 ++-- Tests/RegexBuilderTests/RegexDSLTests.swift | 6 +++--- Tests/RegexTests/AlgorithmsTests.swift | 10 +++++----- Tests/RegexTests/MatchTests.swift | 10 +++++----- 10 files changed, 26 insertions(+), 26 deletions(-) diff --git a/Documentation/Evolution/RegexLiterals.md b/Documentation/Evolution/RegexLiterals.md index 3c12c9c7a..3643590d4 100644 --- a/Documentation/Evolution/RegexLiterals.md +++ b/Documentation/Evolution/RegexLiterals.md @@ -12,7 +12,7 @@ In *[Regex Type and Overview][regex-type]* we introduced the `Regex` type, which ```swift let pattern = #"(\w+)\s\s+(\S+)\s\s+((?:(?!\s\s).)*)\s\s+(.*)"# -let regex = try! Regex(compiling: pattern) +let regex = try! Regex(pattern) // regex: Regex ``` @@ -366,7 +366,7 @@ However we decided against this because: ### No custom literal -Instead of adding a custom regex literal, we could require users to explicitly write `try! Regex(compiling: "[abc]+")`. This would be similar to `NSRegularExpression`, and loses all the benefits of parsing the literal at compile time. 
This would mean: +Instead of adding a custom regex literal, we could require users to explicitly write `try! Regex("[abc]+")`. This would be similar to `NSRegularExpression`, and loses all the benefits of parsing the literal at compile time. This would mean: - No source tooling support (e.g syntax highlighting, refactoring actions) would be available. - Parse errors would be diagnosed at run time rather than at compile time. diff --git a/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md b/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md index 3fb0841e3..1a868aa04 100644 --- a/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md +++ b/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md @@ -50,11 +50,11 @@ We propose run-time construction of `Regex` from a best-in-class treatment of fa ```swift let pattern = #"(\w+)\s\s+(\S+)\s\s+((?:(?!\s\s).)*)\s\s+(.*)"# -let regex = try! Regex(compiling: pattern) +let regex = try! Regex(pattern) // regex: Regex let regex: Regex<(Substring, Substring, Substring, Substring, Substring)> = - try! Regex(compiling: pattern) + try! Regex(pattern) ``` ### Syntax diff --git a/Documentation/Evolution/RegexTypeOverview.md b/Documentation/Evolution/RegexTypeOverview.md index bce336551..9fd369dbf 100644 --- a/Documentation/Evolution/RegexTypeOverview.md +++ b/Documentation/Evolution/RegexTypeOverview.md @@ -134,11 +134,11 @@ Regexes can be created at run time from a string containing familiar regex synta ```swift let pattern = #"(\w+)\s\s+(\S+)\s\s+((?:(?!\s\s).)*)\s\s+(.*)"# -let regex = try! Regex(compiling: pattern) +let regex = try! Regex(pattern) // regex: Regex let regex: Regex<(Substring, Substring, Substring, Substring, Substring)> = - try! Regex(compiling: pattern) + try! Regex(pattern) ``` *Note*: The syntax accepted and further details on run-time compilation, including `AnyRegexOutput` and extended syntaxes, are discussed in [Run-time Regex Construction][pitches]. 
@@ -300,7 +300,7 @@ Regex targets [UTS\#18 Level 2](https://www.unicode.org/reports/tr18/#Extended_U ```swift /// A regex represents a string processing algorithm. /// -/// let regex = try Regex(compiling: "a(.*)b") +/// let regex = try Regex("a(.*)b") /// let match = "cbaxb".firstMatch(of: regex) /// print(match.0) // "axb" /// print(match.1) // "x" diff --git a/Sources/Exercises/Participants/RegexParticipant.swift b/Sources/Exercises/Participants/RegexParticipant.swift index 6c53b3adf..627f9583b 100644 --- a/Sources/Exercises/Participants/RegexParticipant.swift +++ b/Sources/Exercises/Participants/RegexParticipant.swift @@ -70,7 +70,7 @@ private func graphemeBreakPropertyDataLiteral( forLine line: String ) -> GraphemeBreakEntry? { let regex = try! Regex( - compiling: #"([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+).*"#, + #"([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+).*"#, as: (Substring, Substring, Substring?, Substring).self) return graphemeBreakPropertyData(forLine: line, using: regex) } diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index 8dacb5d50..0d018aa81 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -14,7 +14,7 @@ @available(SwiftStdlib 5.7, *) extension Regex where Output == AnyRegexOutput { /// Parse and compile `pattern`, resulting in an existentially-typed capture list. - public init(compiling pattern: String) throws { + public init(_ pattern: String) throws { self.init(ast: try parse(pattern, .traditional)) } } @@ -23,7 +23,7 @@ extension Regex where Output == AnyRegexOutput { extension Regex { /// Parse and compile `pattern`, resulting in a strongly-typed capture list. 
public init( - compiling pattern: String, + _ pattern: String, as: Output.Type = Output.self ) throws { self.init(ast: try parse(pattern, .traditional)) diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index 56a14da51..d77784df4 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -21,7 +21,7 @@ public protocol RegexComponent { /// A regex represents a string processing algorithm. /// -/// let regex = try Regex(compiling: "a(.*)b") +/// let regex = try Regex("a(.*)b") /// let match = "cbaxb".firstMatch(of: regex) /// print(match.0) // "axb" /// print(match.1) // "x" diff --git a/Tests/RegexBuilderTests/MotivationTests.swift b/Tests/RegexBuilderTests/MotivationTests.swift index 1927b9ae4..22e790e2d 100644 --- a/Tests/RegexBuilderTests/MotivationTests.swift +++ b/Tests/RegexBuilderTests/MotivationTests.swift @@ -139,7 +139,7 @@ private func processWithRuntimeDynamicRegex( _ line: String ) -> Transaction? { // FIXME: Shouldn't this init throw? - let regex = try! Regex(compiling: pattern) + let regex = try! Regex(pattern) // guard let result = line.match(regex) else { return nil } // @@ -156,7 +156,7 @@ private func processWithRuntimeDynamicRegex( @available(macOS 12.0, *) private func processWithRuntimeStaticRegex(_ line: String) -> Transaction? { let regex: Regex<(Substring, Substring, Substring, Substring, Substring)> - = try! Regex(compiling: pattern) + = try! 
Regex(pattern) return process(line, using: regex) } diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 58f847f32..6d74de826 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -666,7 +666,7 @@ class RegexDSLTests: XCTestCase { do { let regexLiteral = try Regex( - compiling: #"([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+).*"#, + #"([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+).*"#, as: (Substring, Substring, Substring?, Substring).self) let maybeMatchResult = line.wholeMatch(of: regexLiteral) let matchResult = try XCTUnwrap(maybeMatchResult) @@ -680,7 +680,7 @@ class RegexDSLTests: XCTestCase { func testDynamicCaptures() throws { do { - let regex = try Regex(compiling: "aabcc.") + let regex = try Regex("aabcc.") let line = "aabccd" let match = try XCTUnwrap(line.wholeMatch(of: regex)) XCTAssertEqual(match.0, line[...]) @@ -689,7 +689,7 @@ class RegexDSLTests: XCTestCase { } do { let regex = try Regex( - compiling: #""" + #""" (?[0-9A-F]+)(?:\.\.(?[0-9A-F]+))?\s+;\s+(?\w+).* """#) let line = """ diff --git a/Tests/RegexTests/AlgorithmsTests.swift b/Tests/RegexTests/AlgorithmsTests.swift index a7832a0f9..a788ad13c 100644 --- a/Tests/RegexTests/AlgorithmsTests.swift +++ b/Tests/RegexTests/AlgorithmsTests.swift @@ -32,7 +32,7 @@ class RegexConsumerTests: XCTestCase { _ expected: [Range], file: StaticString = #file, line: UInt = #line ) { - let regex = try! Regex(compiling: regex) + let regex = try! Regex(regex) let actualSeq: [Range] = string[...].ranges(of: regex).map(string.offsets(of:)) XCTAssertEqual(actualSeq, expected, file: file, line: line) @@ -69,7 +69,7 @@ class RegexConsumerTests: XCTestCase { _ expected: [Substring], file: StaticString = #file, line: UInt = #line ) { - let regex = try! Regex(compiling: regex) + let regex = try! 
Regex(regex) let actual = Array(string.split(by: regex)) XCTAssertEqual(actual, expected, file: file, line: line) } @@ -89,7 +89,7 @@ class RegexConsumerTests: XCTestCase { _ expected: String, file: StaticString = #file, line: UInt = #line ) { - let regex = try! Regex(compiling: regex) + let regex = try! Regex(regex) let actual = string.replacing(regex, with: replacement) XCTAssertEqual(actual, expected, file: file, line: line) } @@ -108,7 +108,7 @@ class RegexConsumerTests: XCTestCase { } func testAdHoc() { - let r = try! Regex(compiling: "a|b+") + let r = try! Regex("a|b+") XCTAssert("palindrome".contains(r)) XCTAssert("botany".contains(r)) @@ -142,7 +142,7 @@ class RegexConsumerTests: XCTestCase { let s = "aaa | aaaaaa | aaaaaaaaaa" let s1 = s.dropFirst(6) // "aaaaaa | aaaaaaaaaa" let s2 = s1.dropLast(17) // "aa" - let regex = try! Regex(compiling: "a+") + let regex = try! Regex("a+") XCTAssertEqual(s.firstMatch(of: regex)?.0, "aaa") XCTAssertEqual(s1.firstMatch(of: regex)?.0, "aaaaaa") diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 4d9ed4d01..e00c77f56 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1290,11 +1290,11 @@ extension RegexTests { 04: Arkansas 05: California """ - XCTAssertTrue(string.contains(try Regex(compiling: #"^\d+"#))) - XCTAssertEqual(string.ranges(of: try Regex(compiling: #"^\d+"#)).count, 1) - XCTAssertEqual(string.ranges(of: try Regex(compiling: #"(?m)^\d+"#)).count, 5) + XCTAssertTrue(string.contains(try Regex(#"^\d+"#))) + XCTAssertEqual(string.ranges(of: try Regex(#"^\d+"#)).count, 1) + XCTAssertEqual(string.ranges(of: try Regex(#"(?m)^\d+"#)).count, 5) - let regex = try Regex(compiling: #"^\d+: [\w ]+$"#) + let regex = try Regex(#"^\d+: [\w ]+$"#) XCTAssertFalse(string.contains(regex)) let allRanges = string.ranges(of: regex.anchorsMatchLineEndings()) XCTAssertEqual(allRanges.count, 5) @@ -1333,7 +1333,7 @@ extension RegexTests { } func 
testOptionMethods() throws { - let regex = try Regex(compiling: "c.f.") + let regex = try Regex("c.f.") XCTAssertTrue ("cafe".contains(regex)) XCTAssertFalse("CaFe".contains(regex)) From b24d3ea48808d9362406176eeeaaff10f8599508 Mon Sep 17 00:00:00 2001 From: Tina Liu <49205802+itingliu@users.noreply.github.com> Date: Thu, 21 Apr 2022 10:53:11 -0700 Subject: [PATCH 09/18] Move the closure argument to the end of the arg list (#307) Move the closure argument in `replace` and `replacing` to the end of the argument list for trailing closure syntax. Add a test for replacing within a range. --- .../Evolution/StringProcessingAlgorithms.md | 24 +++++----- .../Algorithms/Matching/MatchReplace.swift | 32 +++++++------- Tests/RegexBuilderTests/AlgorithmsTests.swift | 44 +++++++++++++++++++ 3 files changed, 72 insertions(+), 28 deletions(-) diff --git a/Documentation/Evolution/StringProcessingAlgorithms.md b/Documentation/Evolution/StringProcessingAlgorithms.md index b976c562e..8680ff75a 100644 --- a/Documentation/Evolution/StringProcessingAlgorithms.md +++ b/Documentation/Evolution/StringProcessingAlgorithms.md @@ -511,48 +511,48 @@ extension RangeReplaceableCollection where SubSequence == Substring { /// the given regex are replaced by another regex match. /// - Parameters: /// - regex: A regex describing the sequence to replace. - /// - replacement: A closure that receives the full match information, - /// including captures, and returns a replacement collection. /// - subrange: The range in the collection in which to search for `regex`. /// - maxReplacements: A number specifying how many occurrences of the /// sequence matching `regex` to replace. Default is `Int.max`. + /// - replacement: A closure that receives the full match information, + /// including captures, and returns a replacement collection. /// - Returns: A new collection in which all occurrences of subsequence /// matching `regex` are replaced by `replacement`. 
public func replacing( _ regex: R, - with replacement: (RegexMatch) throws -> Replacement, subrange: Range, - maxReplacements: Int = .max + maxReplacements: Int = .max, + with replacement: (RegexMatch) throws -> Replacement ) rethrows -> Self where Replacement.Element == Element /// Returns a new collection in which all occurrences of a sequence matching /// the given regex are replaced by another collection. /// - Parameters: /// - regex: A regex describing the sequence to replace. - /// - replacement: A closure that receives the full match information, - /// including captures, and returns a replacement collection. /// - maxReplacements: A number specifying how many occurrences of the /// sequence matching `regex` to replace. Default is `Int.max`. + /// - replacement: A closure that receives the full match information, + /// including captures, and returns a replacement collection. /// - Returns: A new collection in which all occurrences of subsequence /// matching `regex` are replaced by `replacement`. public func replacing( _ regex: R, - with replacement: (RegexMatch) throws -> Replacement, - maxReplacements: Int = .max + maxReplacements: Int = .max, + with replacement: (RegexMatch) throws -> Replacement ) rethrows -> Self where Replacement.Element == Element /// Replaces all occurrences of the sequence matching the given regex with /// a given collection. /// - Parameters: /// - regex: A regex describing the sequence to replace. - /// - replacement: A closure that receives the full match information, - /// including captures, and returns a replacement collection. /// - maxReplacements: A number specifying how many occurrences of the /// sequence matching `regex` to replace. Default is `Int.max`. + /// - replacement: A closure that receives the full match information, + /// including captures, and returns a replacement collection. 
public mutating func replace( _ regex: R, - with replacement: (RegexMatch) throws -> Replacement, - maxReplacements: Int = .max + maxReplacements: Int = .max, + with replacement: (RegexMatch) throws -> Replacement ) rethrows where Replacement.Element == Element } ``` diff --git a/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift b/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift index 09e021a29..206d68554 100644 --- a/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift +++ b/Sources/_StringProcessing/Algorithms/Matching/MatchReplace.swift @@ -118,19 +118,19 @@ extension RangeReplaceableCollection where SubSequence == Substring { /// the given regex are replaced by another regex match. /// - Parameters: /// - regex: A regex describing the sequence to replace. - /// - replacement: A closure that receives the full match information, - /// including captures, and returns a replacement collection. /// - subrange: The range in the collection in which to search for `regex`. /// - maxReplacements: A number specifying how many occurrences of the /// sequence matching `regex` to replace. Default is `Int.max`. + /// - replacement: A closure that receives the full match information, + /// including captures, and returns a replacement collection. /// - Returns: A new collection in which all occurrences of subsequence /// matching `regex` are replaced by `replacement`. @available(SwiftStdlib 5.7, *) public func replacing( _ regex: R, - with replacement: (Regex.Match) throws -> Replacement, subrange: Range, - maxReplacements: Int = .max + maxReplacements: Int = .max, + with replacement: (Regex.Match) throws -> Replacement ) rethrows -> Self where Replacement.Element == Element { precondition(maxReplacements >= 0) @@ -155,43 +155,43 @@ extension RangeReplaceableCollection where SubSequence == Substring { /// the given regex are replaced by another collection. /// - Parameters: /// - regex: A regex describing the sequence to replace. 
- /// - replacement: A closure that receives the full match information, - /// including captures, and returns a replacement collection. /// - maxReplacements: A number specifying how many occurrences of the /// sequence matching `regex` to replace. Default is `Int.max`. + /// - replacement: A closure that receives the full match information, + /// including captures, and returns a replacement collection. /// - Returns: A new collection in which all occurrences of subsequence /// matching `regex` are replaced by `replacement`. @available(SwiftStdlib 5.7, *) public func replacing( _ regex: R, - with replacement: (Regex.Match) throws -> Replacement, - maxReplacements: Int = .max + maxReplacements: Int = .max, + with replacement: (Regex.Match) throws -> Replacement ) rethrows -> Self where Replacement.Element == Element { try replacing( regex, - with: replacement, subrange: startIndex..( _ regex: R, - with replacement: (Regex.Match) throws -> Replacement, - maxReplacements: Int = .max + maxReplacements: Int = .max, + with replacement: (Regex.Match) throws -> Replacement ) rethrows where Replacement.Element == Element { self = try replacing( regex, - with: replacement, subrange: startIndex..( + _ regex: R, + input: String, + _ replace: (Regex.Match) -> String, + _ tests: (subrange: Range, maxReplacement: Int, result: String)..., + file: StaticString = #file, + line: UInt = #line + ) { + for (subrange, maxReplacement, result) in tests { + XCTAssertEqual(input.replacing(regex, subrange: subrange, maxReplacements: maxReplacement, with: replace), result, file: file, line: line) + } + } + + let int = Capture(OneOrMore(.digit)) { Int($0)! } + + let addition = "9+16, 0+3, 5+5, 99+1" + + replaceTest( + Regex { int; "+"; int }, + input: "9+16, 0+3, 5+5, 99+1", + { match in "\(match.output.1 + match.output.2)" }, + + (subrange: addition.startIndex.. 
Date: Thu, 21 Apr 2022 14:26:50 -0500 Subject: [PATCH 10/18] Adds RegexBuilder.CharacterClass.anyUnicodeScalar (#315) This provides a RegexBuilder API that represents the same as `\O` in regex syntax. --- Sources/RegexBuilder/CharacterClass.swift | 4 ++++ .../_StringProcessing/_CharacterClassModel.swift | 15 ++++++++++++++- Tests/RegexTests/MatchTests.swift | 12 +++++------- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index 0087d734a..b7d8454bb 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -51,6 +51,10 @@ extension RegexComponent where Self == CharacterClass { public static var anyGrapheme: CharacterClass { .init(unconverted: .anyGrapheme) } + + public static var anyUnicodeScalar: CharacterClass { + .init(unconverted: .anyUnicodeScalar) + } public static var whitespace: CharacterClass { .init(unconverted: .whitespace) diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 2debcda9d..c02725e33 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -33,6 +33,8 @@ public struct _CharacterClassModel: Hashable { case any /// Any grapheme cluster case anyGrapheme + /// Any Unicode scalar + case anyScalar /// Character.isDigit case digit /// Character.isHexDigit @@ -159,8 +161,12 @@ public struct _CharacterClassModel: Hashable { case .graphemeCluster: let c = str[i] var matched: Bool + var next = str.index(after: i) switch cc { case .any, .anyGrapheme: matched = true + case .anyScalar: + matched = true + next = str.unicodeScalars.index(after: i) case .digit: matched = c.isNumber && (c.isASCII || !options.usesASCIIDigits) case .hexDigit: @@ -178,12 +184,13 @@ public struct _CharacterClassModel: Hashable { if isInverted { matched.toggle() } - return matched ? 
str.index(after: i) : nil + return matched ? next : nil case .unicodeScalar: let c = str.unicodeScalars[i] var matched: Bool switch cc { case .any: matched = true + case .anyScalar: matched = true case .anyGrapheme: fatalError("Not matched in this mode") case .digit: matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits) @@ -228,6 +235,10 @@ extension _CharacterClassModel { .init(cc: .anyGrapheme, matchLevel: .graphemeCluster) } + public static var anyUnicodeScalar: _CharacterClassModel { + .init(cc: .any, matchLevel: .unicodeScalar) + } + public static var whitespace: _CharacterClassModel { .init(cc: .whitespace, matchLevel: .graphemeCluster) } @@ -279,6 +290,7 @@ extension _CharacterClassModel.Representation: CustomStringConvertible { switch self { case .any: return "" case .anyGrapheme: return "" + case .anyScalar: return "" case .digit: return "" case .hexDigit: return "" case .horizontalWhitespace: return "" @@ -445,6 +457,7 @@ extension AST.Atom.EscapedBuiltin { case .notWordCharacter: return .word.inverted case .graphemeCluster: return .anyGrapheme + case .trueAnychar: return .anyUnicodeScalar default: return nil diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index e00c77f56..dab53cc1c 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1512,13 +1512,11 @@ extension RegexTests { (eDecomposed, false)) // FIXME: \O is unsupported - firstMatchTest(#"\O\u{301}"#, input: eDecomposed, match: eDecomposed, - xfail: true) - firstMatchTest(#"e\O"#, input: eDecomposed, match: eDecomposed, - xfail: true) - firstMatchTest(#"\O\u{301}"#, input: eComposed, match: nil, - xfail: true) - firstMatchTest(#"e\O"#, input: eComposed, match: nil, + firstMatchTest(#"(?u)\O\u{301}"#, input: eDecomposed, match: eDecomposed) + firstMatchTest(#"(?u)e\O"#, input: eDecomposed, match: eDecomposed, + xfail: true) + firstMatchTest(#"\O"#, input: eComposed, match: eComposed) + 
firstMatchTest(#"\O"#, input: eDecomposed, match: nil, xfail: true) matchTest( From 4857bc719518571042a57f9a95d3caba0fe2ca05 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 21 Apr 2022 14:36:28 -0500 Subject: [PATCH 11/18] Allow setting any of the three quant behaviors (#311) This also moves QuantificationBehavior from the RegexBuilder module down to _StringProcessing, and renames it to RegexRepetitionBehavior. --- Sources/RegexBuilder/DSL.swift | 44 +-- Sources/RegexBuilder/Variadics.swift | 308 +++++++++--------- .../VariadicsGenerator.swift | 10 +- .../Regex/AST/MatchingOptions.swift | 4 + Sources/_StringProcessing/ByteCodeGen.swift | 4 +- .../_StringProcessing/MatchingOptions.swift | 34 +- Sources/_StringProcessing/Regex/Options.swift | 66 +++- Tests/RegexBuilderTests/RegexDSLTests.swift | 101 ++++-- 8 files changed, 332 insertions(+), 239 deletions(-) diff --git a/Sources/RegexBuilder/DSL.swift b/Sources/RegexBuilder/DSL.swift index 97bc35154..62aacc4af 100644 --- a/Sources/RegexBuilder/DSL.swift +++ b/Sources/RegexBuilder/DSL.swift @@ -94,40 +94,20 @@ extension UnicodeScalar: RegexComponent { // Note: Quantifiers are currently gyb'd. -/// Specifies how much to attempt to match when using a quantifier. -@available(SwiftStdlib 5.7, *) -public struct QuantificationBehavior { - internal enum Kind { - case eagerly - case reluctantly - case possessively - } - - var kind: Kind - - internal var astKind: DSLTree._AST.QuantificationKind { - switch kind { - case .eagerly: return .eager - case .reluctantly: return .reluctant - case .possessively: return .possessive - } - } -} - extension DSLTree.Node { /// Generates a DSLTree node for a repeated range of the given DSLTree node. /// Individual public API functions are in the generated Variadics.swift file. 
@available(SwiftStdlib 5.7, *) static func repeating( _ range: Range, - _ behavior: QuantificationBehavior?, + _ behavior: RegexRepetitionBehavior?, _ node: DSLTree.Node ) -> DSLTree.Node { // TODO: Throw these as errors assert(range.lowerBound >= 0, "Cannot specify a negative lower bound") assert(!range.isEmpty, "Cannot specify an empty range") - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default switch (range.lowerBound, range.upperBound) { case (0, Int.max): // 0... @@ -147,26 +127,6 @@ extension DSLTree.Node { } } -@available(SwiftStdlib 5.7, *) -extension QuantificationBehavior { - /// Match as much of the input string as possible, backtracking when - /// necessary. - public static var eagerly: QuantificationBehavior { - .init(kind: .eagerly) - } - - /// Match as little of the input string as possible, expanding the matched - /// region as necessary to complete a match. - public static var reluctantly: QuantificationBehavior { - .init(kind: .reluctantly) - } - - /// Match as much of the input string as possible, performing no backtracking. - public static var possessively: QuantificationBehavior { - .init(kind: .possessively) - } -} - @available(SwiftStdlib 5.7, *) public struct OneOrMore: _BuiltinRegexComponent { public var regex: Regex diff --git a/Sources/RegexBuilder/Variadics.swift b/Sources/RegexBuilder/Variadics.swift index 3697be15e..356853ec5 100644 --- a/Sources/RegexBuilder/Variadics.swift +++ b/Sources/RegexBuilder/Variadics.swift @@ -615,9 +615,9 @@ extension Optionally { @_disfavoredOverload public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == Substring { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? 
.default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrOne, kind, component.regex.root)) } } @@ -627,10 +627,10 @@ extension Optionally { @available(SwiftStdlib 5.7, *) @_disfavoredOverload public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == Substring { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrOne, kind, component().regex.root)) } } @@ -650,9 +650,9 @@ extension ZeroOrMore { @_disfavoredOverload public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == Substring { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrMore, kind, component.regex.root)) } } @@ -662,10 +662,10 @@ extension ZeroOrMore { @available(SwiftStdlib 5.7, *) @_disfavoredOverload public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == Substring { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrMore, kind, component().regex.root)) } } @@ -677,9 +677,9 @@ extension OneOrMore { @_disfavoredOverload public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? 
= nil ) where RegexOutput == Substring { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.oneOrMore, kind, component.regex.root)) } } @@ -689,10 +689,10 @@ extension OneOrMore { @available(SwiftStdlib 5.7, *) @_disfavoredOverload public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == Substring { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.oneOrMore, kind, component().regex.root)) } } @@ -727,7 +727,7 @@ extension Repeat { public init( _ component: Component, _ expression: R, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == Substring, R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ expression: R, - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == Substring, R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?), Component.RegexOutput == (W, C0) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? 
.default self.init(node: .quantification(.zeroOrOne, kind, component.regex.root)) } } @@ -758,10 +758,10 @@ extension Optionally { extension Optionally { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?), Component.RegexOutput == (W, C0) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrOne, kind, component().regex.root)) } } @@ -780,9 +780,9 @@ extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?), Component.RegexOutput == (W, C0) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrMore, kind, component.regex.root)) } } @@ -791,10 +791,10 @@ extension ZeroOrMore { extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?), Component.RegexOutput == (W, C0) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrMore, kind, component().regex.root)) } } @@ -805,9 +805,9 @@ extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? 
= nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0), Component.RegexOutput == (W, C0) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.oneOrMore, kind, component.regex.root)) } } @@ -816,10 +816,10 @@ extension OneOrMore { extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0), Component.RegexOutput == (W, C0) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.oneOrMore, kind, component().regex.root)) } } @@ -851,7 +851,7 @@ extension Repeat { public init( _ component: Component, _ expression: R, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?), Component.RegexOutput == (W, C0), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ expression: R, - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?), Component.RegexOutput == (W, C0), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?), Component.RegexOutput == (W, C0, C1) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? 
.default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrOne, kind, component.regex.root)) } } @@ -881,10 +881,10 @@ extension Optionally { extension Optionally { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?), Component.RegexOutput == (W, C0, C1) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrOne, kind, component().regex.root)) } } @@ -903,9 +903,9 @@ extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?), Component.RegexOutput == (W, C0, C1) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrMore, kind, component.regex.root)) } } @@ -914,10 +914,10 @@ extension ZeroOrMore { extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?), Component.RegexOutput == (W, C0, C1) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? 
.default self.init(node: .quantification(.zeroOrMore, kind, component().regex.root)) } } @@ -928,9 +928,9 @@ extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0, C1), Component.RegexOutput == (W, C0, C1) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.oneOrMore, kind, component.regex.root)) } } @@ -939,10 +939,10 @@ extension OneOrMore { extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0, C1), Component.RegexOutput == (W, C0, C1) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.oneOrMore, kind, component().regex.root)) } } @@ -974,7 +974,7 @@ extension Repeat { public init( _ component: Component, _ expression: R, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?), Component.RegexOutput == (W, C0, C1), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ expression: R, - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?), Component.RegexOutput == (W, C0, C1), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ component: Component, - _ behavior: QuantificationBehavior? 
= nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?), Component.RegexOutput == (W, C0, C1, C2) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrOne, kind, component.regex.root)) } } @@ -1004,10 +1004,10 @@ extension Optionally { extension Optionally { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?), Component.RegexOutput == (W, C0, C1, C2) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrOne, kind, component().regex.root)) } } @@ -1026,9 +1026,9 @@ extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?), Component.RegexOutput == (W, C0, C1, C2) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrMore, kind, component.regex.root)) } } @@ -1037,10 +1037,10 @@ extension ZeroOrMore { extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? 
= nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?), Component.RegexOutput == (W, C0, C1, C2) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrMore, kind, component().regex.root)) } } @@ -1051,9 +1051,9 @@ extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0, C1, C2), Component.RegexOutput == (W, C0, C1, C2) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.oneOrMore, kind, component.regex.root)) } } @@ -1062,10 +1062,10 @@ extension OneOrMore { extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0, C1, C2), Component.RegexOutput == (W, C0, C1, C2) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.oneOrMore, kind, component().regex.root)) } } @@ -1097,7 +1097,7 @@ extension Repeat { public init( _ component: Component, _ expression: R, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?), Component.RegexOutput == (W, C0, C1, C2), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ expression: R, - _ behavior: QuantificationBehavior? 
= nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?), Component.RegexOutput == (W, C0, C1, C2), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?), Component.RegexOutput == (W, C0, C1, C2, C3) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrOne, kind, component.regex.root)) } } @@ -1127,10 +1127,10 @@ extension Optionally { extension Optionally { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?), Component.RegexOutput == (W, C0, C1, C2, C3) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrOne, kind, component().regex.root)) } } @@ -1149,9 +1149,9 @@ extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?), Component.RegexOutput == (W, C0, C1, C2, C3) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? 
.default self.init(node: .quantification(.zeroOrMore, kind, component.regex.root)) } } @@ -1160,10 +1160,10 @@ extension ZeroOrMore { extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?), Component.RegexOutput == (W, C0, C1, C2, C3) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrMore, kind, component().regex.root)) } } @@ -1174,9 +1174,9 @@ extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0, C1, C2, C3), Component.RegexOutput == (W, C0, C1, C2, C3) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.oneOrMore, kind, component.regex.root)) } } @@ -1185,10 +1185,10 @@ extension OneOrMore { extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0, C1, C2, C3), Component.RegexOutput == (W, C0, C1, C2, C3) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? 
.default self.init(node: .quantification(.oneOrMore, kind, component().regex.root)) } } @@ -1220,7 +1220,7 @@ extension Repeat { public init( _ component: Component, _ expression: R, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?), Component.RegexOutput == (W, C0, C1, C2, C3), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ expression: R, - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?), Component.RegexOutput == (W, C0, C1, C2, C3), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?), Component.RegexOutput == (W, C0, C1, C2, C3, C4) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrOne, kind, component.regex.root)) } } @@ -1250,10 +1250,10 @@ extension Optionally { extension Optionally { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?), Component.RegexOutput == (W, C0, C1, C2, C3, C4) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? 
.default self.init(node: .quantification(.zeroOrOne, kind, component().regex.root)) } } @@ -1272,9 +1272,9 @@ extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?), Component.RegexOutput == (W, C0, C1, C2, C3, C4) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrMore, kind, component.regex.root)) } } @@ -1283,10 +1283,10 @@ extension ZeroOrMore { extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?), Component.RegexOutput == (W, C0, C1, C2, C3, C4) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrMore, kind, component().regex.root)) } } @@ -1297,9 +1297,9 @@ extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0, C1, C2, C3, C4), Component.RegexOutput == (W, C0, C1, C2, C3, C4) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? 
.default self.init(node: .quantification(.oneOrMore, kind, component.regex.root)) } } @@ -1308,10 +1308,10 @@ extension OneOrMore { extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0, C1, C2, C3, C4), Component.RegexOutput == (W, C0, C1, C2, C3, C4) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.oneOrMore, kind, component().regex.root)) } } @@ -1343,7 +1343,7 @@ extension Repeat { public init( _ component: Component, _ expression: R, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?), Component.RegexOutput == (W, C0, C1, C2, C3, C4), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ expression: R, - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?), Component.RegexOutput == (W, C0, C1, C2, C3, C4), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? 
.default self.init(node: .quantification(.zeroOrOne, kind, component.regex.root)) } } @@ -1373,10 +1373,10 @@ extension Optionally { extension Optionally { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrOne, kind, component().regex.root)) } } @@ -1395,9 +1395,9 @@ extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrMore, kind, component.regex.root)) } } @@ -1406,10 +1406,10 @@ extension ZeroOrMore { extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? 
.default self.init(node: .quantification(.zeroOrMore, kind, component().regex.root)) } } @@ -1420,9 +1420,9 @@ extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0, C1, C2, C3, C4, C5), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.oneOrMore, kind, component.regex.root)) } } @@ -1431,10 +1431,10 @@ extension OneOrMore { extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0, C1, C2, C3, C4, C5), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.oneOrMore, kind, component().regex.root)) } } @@ -1466,7 +1466,7 @@ extension Repeat { public init( _ component: Component, _ expression: R, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ expression: R, - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? 
= nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrOne, kind, component.regex.root)) } } @@ -1496,10 +1496,10 @@ extension Optionally { extension Optionally { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrOne, kind, component().regex.root)) } } @@ -1518,9 +1518,9 @@ extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? 
.default self.init(node: .quantification(.zeroOrMore, kind, component.regex.root)) } } @@ -1529,10 +1529,10 @@ extension ZeroOrMore { extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrMore, kind, component().regex.root)) } } @@ -1543,9 +1543,9 @@ extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0, C1, C2, C3, C4, C5, C6), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.oneOrMore, kind, component.regex.root)) } } @@ -1554,10 +1554,10 @@ extension OneOrMore { extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0, C1, C2, C3, C4, C5, C6), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? 
.default self.init(node: .quantification(.oneOrMore, kind, component().regex.root)) } } @@ -1589,7 +1589,7 @@ extension Repeat { public init( _ component: Component, _ expression: R, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ expression: R, - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrOne, kind, component.regex.root)) } } @@ -1619,10 +1619,10 @@ extension Optionally { extension Optionally { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? 
.default self.init(node: .quantification(.zeroOrOne, kind, component().regex.root)) } } @@ -1641,9 +1641,9 @@ extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrMore, kind, component.regex.root)) } } @@ -1652,10 +1652,10 @@ extension ZeroOrMore { extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrMore, kind, component().regex.root)) } } @@ -1666,9 +1666,9 @@ extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0, C1, C2, C3, C4, C5, C6, C7), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? 
.default self.init(node: .quantification(.oneOrMore, kind, component.regex.root)) } } @@ -1677,10 +1677,10 @@ extension OneOrMore { extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0, C1, C2, C3, C4, C5, C6, C7), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.oneOrMore, kind, component().regex.root)) } } @@ -1712,7 +1712,7 @@ extension Repeat { public init( _ component: Component, _ expression: R, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ expression: R, - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?, C8?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? 
.default self.init(node: .quantification(.zeroOrOne, kind, component.regex.root)) } } @@ -1742,10 +1742,10 @@ extension Optionally { extension Optionally { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?, C8?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrOne, kind, component().regex.root)) } } @@ -1764,9 +1764,9 @@ extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?, C8?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrMore, kind, component.regex.root)) } } @@ -1775,10 +1775,10 @@ extension ZeroOrMore { extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?, C8?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? 
.default self.init(node: .quantification(.zeroOrMore, kind, component().regex.root)) } } @@ -1789,9 +1789,9 @@ extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.oneOrMore, kind, component.regex.root)) } } @@ -1800,10 +1800,10 @@ extension OneOrMore { extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.oneOrMore, kind, component().regex.root)) } } @@ -1835,7 +1835,7 @@ extension Repeat { public init( _ component: Component, _ expression: R, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?, C8?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ expression: R, - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? 
= nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?, C8?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?, C8?, C9?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrOne, kind, component.regex.root)) } } @@ -1865,10 +1865,10 @@ extension Optionally { extension Optionally { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?, C8?, C9?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrOne, kind, component().regex.root)) } } @@ -1887,9 +1887,9 @@ extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?, C8?, C9?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? 
.default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrMore, kind, component.regex.root)) } } @@ -1898,10 +1898,10 @@ extension ZeroOrMore { extension ZeroOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?, C8?, C9?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.zeroOrMore, kind, component().regex.root)) } } @@ -1912,9 +1912,9 @@ extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.oneOrMore, kind, component.regex.root)) } } @@ -1923,10 +1923,10 @@ extension OneOrMore { extension OneOrMore { @available(SwiftStdlib 5.7, *) public init( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? 
.default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.oneOrMore, kind, component().regex.root)) } } @@ -1958,7 +1958,7 @@ extension Repeat { public init( _ component: Component, _ expression: R, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?, C8?, C9?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ expression: R, - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @RegexComponentBuilder _ component: () -> Component ) where RegexOutput == (Substring, C0?, C1?, C2?, C3?, C4?, C5?, C6?, C7?, C8?, C9?), Component.RegexOutput == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9), R.Bound == Int { self.init(node: .repeating(expression.relative(to: 0..( _ component: Component, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) \(params.whereClauseForInit) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default self.init(node: .quantification(.\(kind.astQuantifierAmount), kind, component.regex.root)) } } @@ -389,10 +389,10 @@ struct VariadicsGenerator: ParsableCommand { \(defaultAvailableAttr) \(params.disfavored)\ public init<\(params.genericParams)>( - _ behavior: QuantificationBehavior? = nil, + _ behavior: RegexRepetitionBehavior? = nil, @\(concatBuilderName) _ component: () -> Component ) \(params.whereClauseForInit) { - let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.astKind) } ?? .default + let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? 
.default self.init(node: .quantification(.\(kind.astQuantifierAmount), kind, component().regex.root)) } } @@ -508,7 +508,7 @@ struct VariadicsGenerator: ParsableCommand { public init<\(params.genericParams), R: RangeExpression>( _ component: Component, _ expression: R, - _ behavior: QuantificationBehavior? = nil + _ behavior: RegexRepetitionBehavior? = nil ) \(params.repeatingWhereClause) { self.init(node: .repeating(expression.relative(to: 0.. Regex { - wrapInOption(.reluctantByDefault, addingIf: useReluctantQuantifiers) + /// Passing `.eager` or `.reluctant` to this method corresponds to applying + /// the `(?-U)` or `(?U)` option in regex syntax, respectively. + /// + /// - Parameter behavior: The default behavior to use for quantifiers. + public func repetitionBehavior(_ behavior: RegexRepetitionBehavior) -> Regex { + if behavior == .possessive { + return wrapInOption(.possessiveByDefault, addingIf: true) + } else { + return wrapInOption(.reluctantByDefault, addingIf: behavior == .reluctant) + } } /// Returns a regular expression that matches with the specified semantic @@ -183,6 +189,46 @@ public struct RegexWordBoundaryKind: Hashable { } } +/// Specifies how much to attempt to match when using a quantifier. +@available(SwiftStdlib 5.7, *) +public struct RegexRepetitionBehavior: Hashable { + internal enum Kind { + case eager + case reluctant + case possessive + } + + var kind: Kind + + @_spi(RegexBuilder) public var dslTreeKind: DSLTree._AST.QuantificationKind { + switch kind { + case .eager: return .eager + case .reluctant: return .reluctant + case .possessive: return .possessive + } + } +} + +@available(SwiftStdlib 5.7, *) +extension RegexRepetitionBehavior { + /// Match as much of the input string as possible, backtracking when + /// necessary. + public static var eager: Self { + .init(kind: .eager) + } + + /// Match as little of the input string as possible, expanding the matched + /// region as necessary to complete a match. 
+ public static var reluctant: Self { + .init(kind: .reluctant) + } + + /// Match as much of the input string as possible, performing no backtracking. + public static var possessive: Self { + .init(kind: .possessive) + } +} + // MARK: - Helper method @available(SwiftStdlib 5.7, *) diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 6d74de826..cc5afda39 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -272,7 +272,7 @@ class RegexDSLTests: XCTestCase { OneOrMore(.word) Anchor.wordBoundary } - OneOrMore(.any, .reluctantly) + OneOrMore(.any, .reluctant) "stop" " " @@ -281,7 +281,7 @@ class RegexDSLTests: XCTestCase { Anchor.wordBoundary } .wordBoundaryKind(.unicodeLevel1) - OneOrMore(.any, .reluctantly) + OneOrMore(.any, .reluctant) "stop" } } @@ -293,14 +293,14 @@ class RegexDSLTests: XCTestCase { Capture { // Reluctant behavior due to option OneOrMore(.anyOf("abcd")) - .reluctantQuantifiers() + .repetitionBehavior(.reluctant) } ZeroOrMore("a"..."z") Capture { // Eager behavior due to explicit parameter, despite option - OneOrMore(.digit, .eagerly) - .reluctantQuantifiers() + OneOrMore(.digit, .eager) + .repetitionBehavior(.reluctant) } ZeroOrMore(.digit) } @@ -319,6 +319,7 @@ class RegexDSLTests: XCTestCase { } func testQuantificationBehavior() throws { + // Eager by default try _testDSLCaptures( ("abc1def2", ("abc1def2", "2")), matchType: (Substring, Substring).self, ==) @@ -328,41 +329,93 @@ class RegexDSLTests: XCTestCase { ZeroOrMore(.any) } + // Explicitly reluctant try _testDSLCaptures( ("abc1def2", ("abc1def2", "1")), matchType: (Substring, Substring).self, ==) { - OneOrMore(.word, .reluctantly) + OneOrMore(.word, .reluctant) Capture(.digit) ZeroOrMore(.any) } - -#if os(macOS) - try XCTExpectFailure("'relucantCaptures()' API should only affect regex literals") { - try _testDSLCaptures( - ("abc1def2", ("abc1def2", "2")), - matchType: (Substring, 
Substring).self, ==) - { - Regex { - OneOrMore(.word) - Capture(.digit) - ZeroOrMore(.any) - }.reluctantQuantifiers() - } - } -#endif - + // Explicitly reluctant overrides default option try _testDSLCaptures( ("abc1def2", ("abc1def2", "1")), matchType: (Substring, Substring).self, ==) { - OneOrMore(.reluctantly) { + OneOrMore(.reluctant) { .word - } + }.repetitionBehavior(.possessive) Capture(.digit) ZeroOrMore(.any) } + // Default set to reluctant + try _testDSLCaptures( + ("abc1def2", ("abc1def2", "1")), + matchType: (Substring, Substring).self, ==) + { + Regex { + OneOrMore(.word) + Capture(.digit) + ZeroOrMore(.any) + }.repetitionBehavior(.reluctant) + } + // Default set to reluctant applies to regex syntax + try _testDSLCaptures( + ("abc1def2", ("abc1def2", "1")), + matchType: (Substring, Substring).self, ==) + { + try! Regex(#"\w+(\d).*"#, as: (Substring, Substring).self) + .repetitionBehavior(.reluctant) + } + // Explicitly possessive + try _testDSLCaptures( + ("aaaa", nil), + matchType: Substring.self, ==) + { + Regex { + OneOrMore("a", .possessive) + "a" + } + } + // Default set to possessive + try _testDSLCaptures( + ("aaaa", nil), + matchType: Substring.self, ==) + { + Regex { + OneOrMore("a") + "a" + }.repetitionBehavior(.possessive) + } + // More specific default set to eager + try _testDSLCaptures( + ("aaaa", ("aaaa", "aaa")), + matchType: (Substring, Substring).self, ==) + { + Regex { + Capture { + OneOrMore("a") + .repetitionBehavior(.eager) + } + OneOrMore("a") + }.repetitionBehavior(.possessive) + } + // More specific default set to reluctant + try _testDSLCaptures( + ("aaaa", ("aaaa", "a")), + matchType: (Substring, Substring).self, ==) + { + Regex { + Capture { + OneOrMore("a") + .repetitionBehavior(.reluctant) + } + OneOrMore("a") + }.repetitionBehavior(.possessive) + } + try _testDSLCaptures( ("abc1def2", "abc1def2"), matchType: Substring.self, ==) From 73a5ccf4b376ca484df5714537d201ea18fc6d5e Mon Sep 17 00:00:00 2001 From: Tina Liu 
<49205802+itingliu@users.noreply.github.com> Date: Fri, 22 Apr 2022 06:39:04 -0700 Subject: [PATCH 12/18] Add `wholeMatch` and `prefixMatch` (#286) Add the functions to string processing algorithms proposal and implement the change. Move the functions from `String` and `SubString` extensions to `BidirectionalCollection`. Add tests for `firstMatch`, `wholeMatch`, and `prefixMatch` that use a custom `BidirectionalCollection` type. --- .../Evolution/StringProcessingAlgorithms.md | 23 ++- .../Algorithms/Matching/FirstMatch.swift | 1 + Sources/_StringProcessing/Regex/Match.swift | 27 +-- Tests/RegexBuilderTests/CustomTests.swift | 161 ++++++++++++++++++ 4 files changed, 188 insertions(+), 24 deletions(-) diff --git a/Documentation/Evolution/StringProcessingAlgorithms.md b/Documentation/Evolution/StringProcessingAlgorithms.md index 8680ff75a..74416ae63 100644 --- a/Documentation/Evolution/StringProcessingAlgorithms.md +++ b/Documentation/Evolution/StringProcessingAlgorithms.md @@ -162,10 +162,11 @@ We also propose the following regex-powered algorithms as well as their generic |`replace(:with:subrange:maxReplacements)`| Replaces all occurrences of the sequence matching the given `RegexComponent` or sequence with a given collection | |`split(by:)`| Returns the longest possible subsequences of the collection around elements equal to the given separator | |`firstMatch(of:)`| Returns the first match of the specified `RegexComponent` within the collection | +|`wholeMatch(of:)`| Matches the specified `RegexComponent` in the collection as a whole | +|`prefixMatch(of:)`| Matches the specified `RegexComponent` against the collection at the beginning | |`matches(of:)`| Returns a collection containing all matches of the specified `RegexComponent` | - ## Detailed design ### `CustomMatchingRegexComponent` @@ -389,7 +390,7 @@ extension BidirectionalCollection where SubSequence == Substring { } ``` -#### First match +#### Match ```swift extension BidirectionalCollection where 
SubSequence == Substring { @@ -398,6 +399,16 @@ extension BidirectionalCollection where SubSequence == Substring { /// - Returns: The first match of `regex` in the collection, or `nil` if /// there isn't a match. public func firstMatch(of regex: R) -> RegexMatch? + + /// Match a regex in its entirety. + /// - Parameter r: The regex to match against. + /// - Returns: The match if there is one, or `nil` if none. + public func wholeMatch(of r: R) -> Regex.Match? + + /// Match a regex against the beginning of the collection. + /// - Parameter r: The regex to match against. + /// - Returns: The match if there is one, or `nil` if none. + public func prefixMatch(of r: R) -> Regex.Match? } ``` @@ -473,7 +484,7 @@ extension RangeReplaceableCollection where SubSequence == Substring { /// - Returns: A new collection in which all occurrences of subsequence /// matching `regex` in `subrange` are replaced by `replacement`. public func replacing( - _ regex: R, + _ r: R, with replacement: Replacement, subrange: Range, maxReplacements: Int = .max @@ -489,7 +500,7 @@ extension RangeReplaceableCollection where SubSequence == Substring { /// - Returns: A new collection in which all occurrences of subsequence /// matching `regex` are replaced by `replacement`. public func replacing( - _ regex: R, + _ r: R, with replacement: Replacement, maxReplacements: Int = .max ) -> Self where Replacement.Element == Element @@ -502,7 +513,7 @@ extension RangeReplaceableCollection where SubSequence == Substring { /// - maxReplacements: A number specifying how many occurrences of the /// sequence matching `regex` to replace. Default is `Int.max`. public mutating func replace( - _ regex: R, + _ r: R, with replacement: Replacement, maxReplacements: Int = .max ) where Replacement.Element == Element @@ -609,4 +620,4 @@ Trimming a string from both sides shares a similar story. For example, `"ababa".
### Future API -Some Python functions are not currently included in this proposal, such as trimming the suffix from a string/collection. This pitch aims to establish a pattern for using `RegexComponent` with string processing algorithms, so that further enhancement can to be introduced to the standard library easily in the future, and eventually close the gap between Swift and other popular scripting languages. +Some common string processing functions are not currently included in this proposal, such as trimming the suffix from a string/collection, and finding overlapping ranges of matched substrings. This pitch aims to establish a pattern for using `RegexComponent` with string processing algorithms, so that further enhancements can be introduced to the standard library easily in the future, and eventually close the gap between Swift and other popular scripting languages. diff --git a/Sources/_StringProcessing/Algorithms/Matching/FirstMatch.swift b/Sources/_StringProcessing/Algorithms/Matching/FirstMatch.swift index cb527f948..4342391af 100644 --- a/Sources/_StringProcessing/Algorithms/Matching/FirstMatch.swift +++ b/Sources/_StringProcessing/Algorithms/Matching/FirstMatch.swift @@ -39,6 +39,7 @@ extension BidirectionalCollection { extension BidirectionalCollection where SubSequence == Substring { @available(SwiftStdlib 5.7, *) + @_disfavoredOverload func firstMatch( of regex: R ) -> _MatchResult>? { diff --git a/Sources/_StringProcessing/Regex/Match.swift b/Sources/_StringProcessing/Regex/Match.swift index 45b177867..e38af43f8 100644 --- a/Sources/_StringProcessing/Regex/Match.swift +++ b/Sources/_StringProcessing/Regex/Match.swift @@ -159,32 +159,23 @@ extension Regex { } @available(SwiftStdlib 5.7, *) -extension String { +extension BidirectionalCollection where SubSequence == Substring { + /// Match a regex in its entirety. + /// - Parameter r: The regex to match against. + /// - Returns: The match if there is one, or `nil` if none.
public func wholeMatch( of r: R ) -> Regex.Match? { - try? r.regex.wholeMatch(in: self) + try? r.regex.wholeMatch(in: self[...].base) } + /// Match a regex against the beginning of the collection. + /// - Parameter r: The regex to match against. + /// - Returns: The match if there is one, or `nil` if none. public func prefixMatch( of r: R ) -> Regex.Match? { - try? r.regex.prefixMatch(in: self) - } -} - -@available(SwiftStdlib 5.7, *) -extension Substring { - public func wholeMatch( - of r: R - ) -> Regex.Match? { - try? r.regex.wholeMatch(in: self) - } - - public func prefixMatch( - of r: R - ) -> Regex.Match? { - try? r.regex.prefixMatch(in: self) + try? r.regex.prefixMatch(in: self[...]) } } diff --git a/Tests/RegexBuilderTests/CustomTests.swift b/Tests/RegexBuilderTests/CustomTests.swift index bf4489a68..d17c3a142 100644 --- a/Tests/RegexBuilderTests/CustomTests.swift +++ b/Tests/RegexBuilderTests/CustomTests.swift @@ -133,6 +133,51 @@ func customTest( } } +// Test support +struct Concat : Equatable { + var wrapped: String + init(_ name: String, _ suffix: Int?)
{ + if let suffix = suffix { + wrapped = name + String(suffix) + } else { + wrapped = name + } + } +} + +extension Concat : Collection { + typealias Index = String.Index + typealias Element = String.Element + + var startIndex: Index { return wrapped.startIndex } + var endIndex: Index { return wrapped.endIndex } + + subscript(position: Index) -> Element { + return wrapped[position] + } + + func index(after i: Index) -> Index { + return wrapped.index(after: i) + } +} + +extension Concat: BidirectionalCollection { + typealias Indices = String.Indices + typealias SubSequence = String.SubSequence + + func index(before i: Index) -> Index { + return wrapped.index(before: i) + } + + var indices: Indices { + wrapped.indices + } + + subscript(bounds: Range) -> Substring { + Substring(wrapped[bounds]) + } +} + class CustomRegexComponentTests: XCTestCase { // TODO: Refactor below into more exhaustive, declarative // tests. @@ -467,4 +512,120 @@ class CustomRegexComponentTests: XCTestCase { ) } + + + func testMatchVarients() { + func customTest( + _ regex: Regex, + _ input: Concat, + expected: (wholeMatch: Match?, firstMatch: Match?, prefixMatch: Match?), + file: StaticString = #file, line: UInt = #line + ) { + let wholeResult = input.wholeMatch(of: regex)?.output + let firstResult = input.firstMatch(of: regex)?.output + let prefixResult = input.prefixMatch(of: regex)?.output + XCTAssertEqual(wholeResult, expected.wholeMatch, file: file, line: line) + XCTAssertEqual(firstResult, expected.firstMatch, file: file, line: line) + XCTAssertEqual(prefixResult, expected.prefixMatch, file: file, line: line) + } + + typealias CaptureMatch1 = (Substring, Int?) 
+ func customTest( + _ regex: Regex, + _ input: Concat, + expected: (wholeMatch: CaptureMatch1?, firstMatch: CaptureMatch1?, prefixMatch: CaptureMatch1?), + file: StaticString = #file, line: UInt = #line + ) { + let wholeResult = input.wholeMatch(of: regex)?.output + let firstResult = input.firstMatch(of: regex)?.output + let prefixResult = input.prefixMatch(of: regex)?.output + XCTAssertEqual(wholeResult?.0, expected.wholeMatch?.0, file: file, line: line) + XCTAssertEqual(wholeResult?.1, expected.wholeMatch?.1, file: file, line: line) + + XCTAssertEqual(firstResult?.0, expected.firstMatch?.0, file: file, line: line) + XCTAssertEqual(firstResult?.1, expected.firstMatch?.1, file: file, line: line) + + XCTAssertEqual(prefixResult?.0, expected.prefixMatch?.0, file: file, line: line) + XCTAssertEqual(prefixResult?.1, expected.prefixMatch?.1, file: file, line: line) + } + + var regex = Regex { + OneOrMore(.digit) + } + + customTest(regex, Concat("amy", 2023), expected:(nil, "2023", nil)) // amy2023 + customTest(regex, Concat("amy2023", nil), expected:(nil, "2023", nil)) + customTest(regex, Concat("amy", nil), expected:(nil, nil, nil)) + customTest(regex, Concat("", 2023), expected:("2023", "2023", "2023")) // 2023 + customTest(regex, Concat("bob012b", 2023), expected:(nil, "012", nil)) // bob012b2023 + customTest(regex, Concat("bob012b", nil), expected:(nil, "012", nil)) + customTest(regex, Concat("007bob", 2023), expected:(nil, "007", "007")) + customTest(regex, Concat("", nil), expected:(nil, nil, nil)) + + regex = Regex { + OneOrMore(CharacterClass("a"..."z")) + } + + customTest(regex, Concat("amy", 2023), expected:(nil, "amy", "amy")) // amy2023 + customTest(regex, Concat("amy", nil), expected:("amy", "amy", "amy")) + customTest(regex, Concat("amy2022-bob", 2023), expected:(nil, "amy", "amy")) // amy2022-bob2023 + customTest(regex, Concat("", 2023), expected:(nil, nil, nil)) // 2023 + customTest(regex, Concat("bob012b", 2023), expected:(nil, "bob", "bob")) // bob012b2023 +
customTest(regex, Concat("bob012b", nil), expected:(nil, "bob", "bob")) + customTest(regex, Concat("007bob", 2023), expected:(nil, "bob", nil)) + customTest(regex, Concat("", nil), expected:(nil, nil, nil)) + + regex = Regex { + OneOrMore { + CharacterClass("A"..."Z") + OneOrMore(CharacterClass("a"..."z")) + Repeat(.digit, count: 2) + } + } + + customTest(regex, Concat("Amy12345", nil), expected:(nil, "Amy12", "Amy12")) + customTest(regex, Concat("Amy", 2023), expected:(nil, "Amy20", "Amy20")) + customTest(regex, Concat("Amy", 23), expected:("Amy23", "Amy23", "Amy23")) + customTest(regex, Concat("", 2023), expected:(nil, nil, nil)) // 2023 + customTest(regex, Concat("Amy23 Boba17", nil), expected:(nil, "Amy23", "Amy23")) + customTest(regex, Concat("amy23 Boba17", nil), expected:(nil, "Boba17", nil)) + customTest(regex, Concat("Amy23 boba17", nil), expected:(nil, "Amy23", "Amy23")) + customTest(regex, Concat("amy23 Boba", 17), expected:(nil, "Boba17", nil)) + customTest(regex, Concat("Amy23Boba17", nil), expected:("Amy23Boba17", "Amy23Boba17", "Amy23Boba17")) + customTest(regex, Concat("Amy23Boba", 17), expected:("Amy23Boba17", "Amy23Boba17", "Amy23Boba17")) + customTest(regex, Concat("23 Boba", 17), expected:(nil, "Boba17", nil)) + + let twoDigitRegex = Regex { + OneOrMore { + CharacterClass("A"..."Z") + OneOrMore(CharacterClass("a"..."z")) + Capture(Repeat(.digit, count: 2)) { Int($0) } + } + } + + customTest(twoDigitRegex, Concat("Amy12345", nil), expected: (nil, ("Amy12", 12), ("Amy12", 12))) + customTest(twoDigitRegex, Concat("Amy", 12345), expected: (nil, ("Amy12", 12), ("Amy12", 12))) + customTest(twoDigitRegex, Concat("Amy", 12), expected: (("Amy12", 12), ("Amy12", 12), ("Amy12", 12))) + customTest(twoDigitRegex, Concat("Amy23 Boba", 17), expected: (nil, firstMatch: ("Amy23", 23), prefixMatch: ("Amy23", 23))) + customTest(twoDigitRegex, Concat("amy23 Boba20", 23), expected:(nil, ("Boba20", 20), nil)) + customTest(twoDigitRegex, Concat("Amy23Boba17", nil), 
expected:(("Amy23Boba17", 17), ("Amy23Boba17", 17), ("Amy23Boba17", 17))) + customTest(twoDigitRegex, Concat("Amy23Boba", 17), expected:(("Amy23Boba17", 17), ("Amy23Boba17", 17), ("Amy23Boba17", 17))) + + let millennium = Regex { + CharacterClass("A"..."Z") + OneOrMore(CharacterClass("a"..."z")) + Capture { Repeat(.digit, count: 4) } transform: { v -> Int? in + guard let year = Int(v) else { return nil } + return year > 2000 ? year : nil + } + } + + customTest(millennium, Concat("Amy2025", nil), expected: (("Amy2025", 2025), ("Amy2025", 2025), ("Amy2025", 2025))) + customTest(millennium, Concat("Amy", 2025), expected: (("Amy2025", 2025), ("Amy2025", 2025), ("Amy2025", 2025))) + customTest(millennium, Concat("Amy1995", nil), expected: (("Amy1995", nil), ("Amy1995", nil), ("Amy1995", nil))) + customTest(millennium, Concat("Amy", 1995), expected: (("Amy1995", nil), ("Amy1995", nil), ("Amy1995", nil))) + customTest(millennium, Concat("amy2025", nil), expected: (nil, nil, nil)) + customTest(millennium, Concat("amy", 2025), expected: (nil, nil, nil)) + } } + From 3e2160c88fd2577e93f93ea99fd0900f0d44cf7b Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Fri, 22 Apr 2022 07:49:58 -0600 Subject: [PATCH 13/18] Update local proposal copies (#317) * Bring run time proposal up to speed * Update repo doc to be in line with SE --- .../RegexSyntaxRunTimeConstruction.md | 25 +++++++++++++-- Documentation/Evolution/RegexTypeOverview.md | 31 +++++++++++++++---- 2 files changed, 47 insertions(+), 9 deletions(-) diff --git a/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md b/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md index 1a868aa04..5c9fa6c59 100644 --- a/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md +++ b/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md @@ -1,7 +1,12 @@ # Regex Syntax and Run-time Construction -- Authors: [Hamish Knight](https://github.com/hamishknight), [Michael Ilseman](https://github.com/milseman) +* Proposal: 
[SE-NNNN](NNNN-filename.md) +* Authors: [Hamish Knight](https://github.com/hamishknight), [Michael Ilseman](https://github.com/milseman) +* Review Manager: [Ben Cohen](https://github.com/airspeedswift) +* Status: **Awaiting review** +* Implementation: https://github.com/apple/swift-experimental-string-processing + * Available in nightly toolchain snapshots with `import _StringProcessing` ## Introduction @@ -81,11 +86,11 @@ We propose initializers to declare and compile a regex from syntax. Upon failure ```swift extension Regex { /// Parse and compile `pattern`, resulting in a strongly-typed capture list. - public init(compiling pattern: String, as: Output.Type = Output.self) throws + public init(_ pattern: String, as: Output.Type = Output.self) throws } extension Regex where Output == AnyRegexOutput { /// Parse and compile `pattern`, resulting in an existentially-typed capture list. - public init(compiling pattern: String) throws + public init(_ pattern: String) throws } ``` @@ -156,6 +161,20 @@ extension Regex.Match where Output == AnyRegexOutput { } ``` +We propose adding API to query and access captures by name in an existentially typed regex match: + +```swift +extension Regex.Match where Output == AnyRegexOutput { + /// If a named-capture with `name` is present, returns its value. Otherwise `nil`. + public subscript(_ name: String) -> AnyRegexOutput.Element? { get } +} + +extension AnyRegexOutput { + /// If a named-capture with `name` is present, returns its value. Otherwise `nil`. + public subscript(_ name: String) -> AnyRegexOutput.Element? { get } +} +``` + The rest of this proposal will be a detailed and exhaustive definition of our proposed regex syntax.
Grammar Notation diff --git a/Documentation/Evolution/RegexTypeOverview.md b/Documentation/Evolution/RegexTypeOverview.md index 9fd369dbf..6eed648f0 100644 --- a/Documentation/Evolution/RegexTypeOverview.md +++ b/Documentation/Evolution/RegexTypeOverview.md @@ -1,6 +1,11 @@ # Regex Type and Overview -- Authors: [Michael Ilseman](https://github.com/milseman) +* Proposal: [SE-0350](0350-regex-type-overview.md) +* Authors: [Michael Ilseman](https://github.com/milseman) +* Review Manager: [Ben Cohen](https://github.com/airspeedswift) +* Status: **Active Review (4 - 28 April 2022)** +* Implementation: https://github.com/apple/swift-experimental-string-processing + * Available in nightly toolchain snapshots with `import _StringProcessing` ## Introduction @@ -207,7 +212,7 @@ func processEntry(_ line: String) -> Transaction? { // amount: Substring // )> - guard let match = regex.matchWhole(line), + guard let match = regex.wholeMatch(line), let kind = Transaction.Kind(match.kind), let date = try? Date(String(match.date), strategy: dateParser), let amount = try? Decimal(String(match.amount), format: decimalParser) @@ -384,21 +389,25 @@ extension Regex.Match { // Run-time compilation interfaces extension Regex { /// Parse and compile `pattern`, resulting in a strongly-typed capture list. - public init(compiling pattern: String, as: Output.Type = Output.self) throws + public init(_ pattern: String, as: Output.Type = Output.self) throws } extension Regex where Output == AnyRegexOutput { /// Parse and compile `pattern`, resulting in an existentially-typed capture list. - public init(compiling pattern: String) throws + public init(_ pattern: String) throws } ``` +### Cancellation + +Regex is somewhat different from existing standard library operations in that regex processing can be a long-running task. +For this reason regex algorithms may check if the parent task has been cancelled and end execution. 
+ ### On severability and related proposals The proposal split presented is meant to aid focused discussion, while acknowledging that each is interconnected. The boundaries between them are not completely cut-and-dry and could be refined as they enter proposal phase. Accepting this proposal in no way implies that all related proposals must be accepted. They are severable and each should stand on their own merit. - ## Source compatibility Everything in this proposal is additive. Regex delimiters may have their own source compatibility impact, which is discussed in that proposal. @@ -488,6 +497,16 @@ The generic parameter `Output` is proposed to contain both the whole match (the The biggest issue with this alternative design is that the numbering of `Captures` elements misaligns with the numbering of captures in textual regexes, where backreference `\0` refers to the entire match and captures start at `\1`. This design would sacrifice familarity and have the pitfall of introducing off-by-one errors. +### Encoding `Regex`es into the type system + +During the initial review period the following comment was made: + +> I think the goal should be that, at least for regex literals (and hopefully for the DSL to some extent), one day we might not even need a bytecode or interpreter. I think the ideal case is if each literal was its own function or type that gets generated and optimised as if you wrote it in Swift. + +This is an approach that has been tried a few times in a few different languages (including by a few members of the Swift Standard Library and Core teams), and while it can produce attractive microbenchmarks, it has almost always proved to be a bad idea at the macro scale. In particular, even if we set aside witness tables and other associated Swift generics overhead, optimizing a fixed pipeline for each pattern you want to match causes significant code size expansion when there are multiple patterns in use, as compared to a more flexible bytecode interpreter.
A bytecode interpreter makes better use of instruction caches and memory, and can also benefit from microarchitectural resources that are shared across different patterns. There is a tradeoff w.r.t. branch prediction resources, where separately compiled patterns may have more decisive branch history data, but a shared bytecode engine has much more data to use; this tradeoff tends to fall on the side of a bytecode engine, but it does not always do so. + +It should also be noted that nothing prevents AOT or JIT compiling of the bytecode if we believe it will be advantageous, but compiling or interpreting arbitrary Swift code at runtime is rather more unattractive, since both the type system and language are undecidable. Even absent this rationale, we would probably not encode regex programs directly into the type system simply because it is unnecessarily complex. + ### Future work: static optimization and compilation Swift's support for static compilation is still developing, and future work here is leveraging that to compile regex when profitable. Many regex describe simple [DFAs](https://en.wikipedia.org/wiki/Deterministic_finite_automaton) and can be statically compiled into very efficient programs. Full static compilation needs to be balanced with code size concerns, as a matching-specific bytecode is typically far smaller than a corresponding program (especially since the bytecode interpreter is shared).
@@ -551,4 +570,4 @@ Regexes are often used for tokenization and tokens can be represented with Swift --> -[pitches]: https://github.com/apple/swift-experimental-string-processing/blob/main/Documentation/Evolution/ProposalOverview.md +[pitches]: https://github.com/apple/swift-experimental-string-processing/blob/main/Documentation/Evolution/ProposalOverview.md \ No newline at end of file From 53acbb2fc517c5fcb58267e0b9a7daef0c89d003 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Fri, 22 Apr 2022 10:02:20 -0600 Subject: [PATCH 14/18] Update ProposalOverview.md --- Documentation/Evolution/ProposalOverview.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/Evolution/ProposalOverview.md b/Documentation/Evolution/ProposalOverview.md index 4346932b5..45712ea1f 100644 --- a/Documentation/Evolution/ProposalOverview.md +++ b/Documentation/Evolution/ProposalOverview.md @@ -19,7 +19,7 @@ Covers the result builder approach and basic API. ## Run-time Regex Construction -- [Pitch](https://github.com/apple/swift-experimental-string-processing/blob/main/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md) +- [Pitch](https://github.com/apple/swift-experimental-string-processing/blob/main/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md), [Thread](https://forums.swift.org/t/pitch-2-regex-syntax-and-run-time-construction/56624) - (old) Pitch thread: [Regex Syntax](https://forums.swift.org/t/pitch-regex-syntax/55711) + Brief: Syntactic superset of PCRE2, Oniguruma, ICU, UTS\#18, etc. 
From b057c4e7415516d2fc57ff7fbdae95bd5f7a4d3e Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Fri, 22 Apr 2022 10:03:09 -0600 Subject: [PATCH 15/18] Update ProposalOverview.md --- Documentation/Evolution/ProposalOverview.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/Evolution/ProposalOverview.md b/Documentation/Evolution/ProposalOverview.md index 45712ea1f..8fa6096e8 100644 --- a/Documentation/Evolution/ProposalOverview.md +++ b/Documentation/Evolution/ProposalOverview.md @@ -27,7 +27,7 @@ Covers the "interior" syntax, extended syntaxes, run-time construction of a rege ## Regex Literals -- [Draft](https://github.com/apple/swift-experimental-string-processing/pull/187) +- [Draft](https://github.com/apple/swift-experimental-string-processing/pull/187), [Thread](https://forums.swift.org/t/pitch-2-regex-literals/56736) - (Old) original pitch: + [Thread](https://forums.swift.org/t/pitch-regular-expression-literals/52820) + [Update](https://forums.swift.org/t/pitch-regular-expression-literals/52820/90) From 8dd8470fb75cce92a1f9204b96f2fce93d296b95 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 22 Apr 2022 11:26:58 -0500 Subject: [PATCH 16/18] Unicode for String Processing proposal (#257) --- Documentation/Evolution/CharacterClasses.md | 503 ---------- Documentation/Evolution/ProposalOverview.md | 6 +- .../Evolution/UnicodeForStringProcessing.md | 872 ++++++++++++++++++ Tests/RegexBuilderTests/RegexDSLTests.swift | 8 + 4 files changed, 883 insertions(+), 506 deletions(-) delete mode 100644 Documentation/Evolution/CharacterClasses.md create mode 100644 Documentation/Evolution/UnicodeForStringProcessing.md diff --git a/Documentation/Evolution/CharacterClasses.md b/Documentation/Evolution/CharacterClasses.md deleted file mode 100644 index c9ffcbc95..000000000 --- a/Documentation/Evolution/CharacterClasses.md +++ /dev/null @@ -1,503 +0,0 @@ -# Character Classes for String Processing - -- **Authors:** [Nate 
Cook](https://github.com/natecook1000), [Michael Ilseman](https://github.com/milseman) -- **Status:** Draft pitch - -## Introduction - -[Declarative String Processing Overview][overview] presents regex-powered matching broadly, without details concerning syntax and semantics, leaving clarification to subsequent pitches. [Regular Expression Literals][literals] presents more details on regex _syntax_ such as delimiters and PCRE-syntax innards, but explicitly excludes discussion of regex _semantics_. This pitch and discussion aims to address a targeted subset of regex semantics: definitions of character classes. We propose a comprehensive treatment of regex character class semantics in the context of existing and newly proposed API directly on `Character` and `Unicode.Scalar`. - -Character classes in regular expressions include metacharacters like `\d` to match a digit, `\s` to match whitespace, and `.` to match any character. Individual literal characters can also be thought of as character classes, as they at least match themselves, and, in case-insensitive matching, their case-toggled counterpart. For the purpose of this work, then, we consider a *character class* to be any part of a regular expression literal that can match an actual component of a string. - -## Motivation - -Operating over classes of characters is a vital component of string processing. Swift's `String` provides, by default, a view of `Character`s or [extended grapheme clusters][graphemes] whose comparison honors [Unicode canonical equivalence][canoneq]. - -```swift -let str = "Cafe\u{301}" // "Café" -str == "Café" // true -str.dropLast() // "Caf" -str.last == "é" // true (precomposed e with acute accent) -str.last == "e\u{301}" // true (e followed by composing acute accent) -``` - -Unicode leaves all interpretation of grapheme clusters up to implementations, which means that Swift needs to define any semantics for its own usage. 
Since other regex engines operate, at most, at the semantics level of Unicode scalar values, there is little to no prior art to consult. - -
Other engines - -Character classes in other languages match at either the Unicode scalar value level, or even the code unit level, instead of recognizing grapheme clusters as characters. When matching the `.` character class, other languages will only match the first part of an `"e\u{301}"` grapheme cluster. Some languages, like Perl, Ruby, and Java, support an additional `\X` metacharacter, which explicitly represents a single grapheme cluster. - -| Matching `"Cafe\u{301}"` | Pattern: `^Caf.` | Remaining | Pattern: `^Caf\X` | Remaining | -|---|---|---|---|---| -| C#, Rust, Go | `"Cafe"` | `"´"` | n/a | n/a | -| NSString, Java, Ruby, Perl | `"Cafe"` | `"´"` | `"Café"` | `""` | - -Other than Java's `CANON_EQ` option, the vast majority of other languages and engines are not capable of comparing with canonical equivalence. - -
- -[SE-0211 Unicode Scalar Properties][scalarprops] added basic building blocks for classification of scalars by surfacing Unicode data from the [UCD][ucd]. [SE-0221: Character Properties][charprops] defined grapheme-cluster semantics for Swift for a subset of these. But, many classifications used in string processing are combinations of scalar properties or ad-hoc listings, and as such are not present today in Swift. - -Regardless of any syntax or underlying formalism, classifying characters is a worthy and much needed addition to the Swift standard library. We believe our thorough treatment of every character class found across many popular regex engines gives Swift a solid semantic basis. - -## Proposed Solution - -This pitch is narrowly scoped to Swift definitions of character classes found in regexes. For each character class, we propose: - -- A name for use in API -- A `Character` API, by extending Unicode scalar definitions to grapheme clusters -- A `Unicode.Scalar` API with modern Unicode definitions -- If applicable, a `Unicode.Scalar` API for notable standards like POSIX - -We're proposing what we believe to be the Swiftiest definitions using [Unicode's guidance][uts18] for `Unicode.Scalar` and extending this to grapheme clusters using `Character`'s existing [rationale][charpropsrationale]. - -
Broad language/engine survey - -For these definitions, we cross-referenced Unicode's [UTS\#18][uts18] with a broad survey of existing languages and engines. We found that while these all support a subset of UTS\#18, each language or framework implements a slightly different subset. The following table shows some of the variations: - -| Language/Framework | Dot (`.`) matches | Supports `\X` | Canonical Equivalence | `\d` matches FULL WIDTH digit | -|------------------------------|----------------------------------------------------|---------------|---------------------------|-------------------------------| -| [ECMAScript][ecmascript] | UTF16 code unit (Unicode scalar in Unicode mode) | no | no | no | -| [Perl][perl] / [PCRE][pcre] | UTF16 code unit, (Unicode scalar in Unicode mode) | yes | no | no | -| [Python3][python] | Unicode scalar | no | no | yes | -| [Raku][raku] | Grapheme cluster | n/a | strings always normalized | yes | -| [Ruby][ruby] | Unicode scalar | yes | no | no | -| [Rust][rust] | Unicode scalar | no | no | no | -| [C#][csharp] | UTF16 code unit | no | no | yes | -| [Java][java] | Unicode scalar | yes | Only in CANON_EQ mode | no | -| [Go][go] | Unicode scalar | no | no | no | -| [`NSRegularExpression`][icu] | Unicode scalar | yes | no | yes | - -We are still in the process of evaluating [C++][cplusplus], [RE2][re2], and [Oniguruma][oniguruma]. - -
- -## Detailed Design - -### Literal characters - -A literal character (such as `a`, `é`, or `한`) in a regex literal matches that particular character or code sequence. When matching at the semantic level of `Unicode.Scalar`, it should match the literal sequence of scalars. When matching at the semantic level of `Character`, it should match `Character`-by-`Character`, honoring Unicode canonical equivalence. - -We are not proposing new API here as this is already handled by `String` and `String.UnicodeScalarView`'s conformance to `Collection`. - -### Unicode values: `\u`, `\U`, `\x` - -Metacharacters that begin with `\u`, `\U`, or `\x` match a character with the specified Unicode scalar values. We propose these be treated exactly the same as literals. - -### Match any: `.`, `\X` - -The dot metacharacter matches any single character or element. Depending on options and modes, it may exclude newlines. - -`\X` matches any grapheme cluster (`Character`), even when the regular expression is otherwise matching at semantic level of `Unicode.Scalar`. - -We are not proposing new API here as this is already handled by collection conformances. - -While we would like for the stdlib to have grapheme-breaking API over collections of `Unicode.Scalar`, that is a separate discussion and out-of-scope for this pitch. - -### Decimal digits: `\d`,`\D` - -We propose `\d` be named "decimalDigit" with the following definitions: - -```swift -extension Character { - /// A Boolean value indicating whether this character represents - /// a decimal digit. - /// - /// Decimal digits are comprised of a single Unicode scalar that has a - /// `numericType` property equal to `.decimal`. This includes the digits - /// from the ASCII range, from the _Halfwidth and Fullwidth Forms_ Unicode - /// block, as well as digits in some scripts, like `DEVANAGARI DIGIT NINE` - /// (U+096F). - /// - /// Decimal digits are a subset of whole numbers, see `isWholeNumber`. 
- /// - /// To get the character's value, use the `decimalDigitValue` property. - public var isDecimalDigit: Bool { get } - - /// The numeric value this character represents, if it is a decimal digit. - /// - /// Decimal digits are comprised of a single Unicode scalar that has a - /// `numericType` property equal to `.decimal`. This includes the digits - /// from the ASCII range, from the _Halfwidth and Fullwidth Forms_ Unicode - /// block, as well as digits in some scripts, like `DEVANAGARI DIGIT NINE` - /// (U+096F). - /// - /// Decimal digits are a subset of whole numbers, see `wholeNumberValue`. - /// - /// let chars: [Character] = ["1", "९", "A"] - /// for ch in chars { - /// print(ch, "-->", ch.decimalDigitValue) - /// } - /// // Prints: - /// // 1 --> Optional(1) - /// // ९ --> Optional(9) - /// // A --> nil - public var decimalDigitValue: Int? { get } - -} - -extension Unicode.Scalar { - /// A Boolean value indicating whether this scalar is considered - /// a decimal digit. - /// - /// Any Unicode scalar that has a `numericType` property equal to `.decimal` - /// is considered a decimal digit. This includes the digits from the ASCII - /// range, from the _Halfwidth and Fullwidth Forms_ Unicode block, as well - /// as digits in some scripts, like `DEVANAGARI DIGIT NINE` (U+096F). - public var isDecimalDigit: Bool { get } -} -``` - -`\D` matches the inverse of `\d`. - -*TBD*: [SE-0221: Character Properties][charprops] did not define equivalent API on `Unicode.Scalar`, as it was itself an extension of single `Unicode.Scalar.Properties`. Since we're defining additional classifications formed from algebraic formulations of properties, it may make sense to put API such as `decimalDigitValue` on `Unicode.Scalar` as well as back-porting other API from `Character` (e.g. `hexDigitValue`). We'd like to discuss this with the community. 
- -*TBD*: `Character.isHexDigit` is currently constrained to the subset of decimal digits that are followed by encodings of Latin letters `A-F` in various forms (all 6 of them... thanks Unicode). We could consider extending this to be a superset of `isDecimalDigit` by allowing and producing values for all decimal digits, one would just have to use the Latin letters to refer to values greater than `9`. We'd like to discuss this with the community. - -_
Rationale_ - -Unicode's recommended definition for `\d` is its [numeric type][numerictype] of "Decimal" in contrast to "Digit". It is specifically restricted to sets of ascending contiguously-encoded scalars in a decimal radix positional numeral system. Thus, it excludes "digits" such as superscript numerals from its [definition][derivednumeric] and is a proper subset of `Character.isWholeNumber`. - -We interpret Unicode's definition of the set of scalars, especially its requirement that scalars be encoded in ascending chains, to imply that this class is restricted to scalars which meaningfully encode base-10 digits. Thus, we choose to make this Character property _restrictive_, similar to `isHexDigit` and `isWholeNumber` and provide a way to access this value. - -It's possible we might add future properties to differentiate Unicode's non-decimal digits, but that is outside the scope of this pitch. - -
- -### Word characters: `\w`, `\W` - -We propose `\w` be named "word character" with the following definitions: - -```swift -extension Character { - /// A Boolean value indicating whether this character is considered - /// a "word" character. - /// - /// See `Unicode.Scalar.isWordCharacter`. - public var isWordCharacter: Bool { get } -} - -extension Unicode.Scalar { - /// A Boolean value indicating whether this scalar is considered - /// a "word" character. - /// - /// Any Unicode scalar that has one of the Unicode properties - /// `Alphabetic`, `Digit`, or `Join_Control`, or is in the - /// general category `Mark` or `Connector_Punctuation`. - public var isWordCharacter: Bool { get } -} -``` - -`\W` matches the inverse of `\w`. - -_
Rationale_ - -Word characters include more than letters, and we went with Unicode's recommended scalar semantics. We extend to grapheme clusters similarly to `Character.isLetter`, that is, subsequent (combining) scalars do not change the word-character-ness of the grapheme cluster. - -
- -### Whitespace and newlines: `\s`, `\S` (plus `\h`, `\H`, `\v`, `\V`, and `\R`) - -We propose `\s` be named "whitespace" with the following definitions: - -```swift -extension Unicode.Scalar { - /// A Boolean value indicating whether this scalar is considered - /// whitespace. - /// - /// All Unicode scalars with the derived `White_Space` property are - /// considered whitespace, including: - /// - /// - `CHARACTER TABULATION` (U+0009) - /// - `LINE FEED (LF)` (U+000A) - /// - `LINE TABULATION` (U+000B) - /// - `FORM FEED (FF)` (U+000C) - /// - `CARRIAGE RETURN (CR)` (U+000D) - /// - `NEWLINE (NEL)` (U+0085) - public var isWhitespace: Bool { get } -} -``` - -This definition matches the value of the existing `Unicode.Scalar.Properties.isWhitespace` property. Note that `Character.isWhitespace` already exists with the desired semantics, which is a grapheme cluster that begins with a whitespace Unicode scalar. - -We propose `\h` be named "horizontalWhitespace" with the following definitions: - -```swift -extension Character { - /// A Boolean value indicating whether this character is considered - /// horizontal whitespace. - /// - /// All characters with an initial Unicode scalar in the general - /// category `Zs`/`Space_Separator`, or the control character - /// `CHARACTER TABULATION` (U+0009), are considered horizontal - /// whitespace. - public var isHorizontalWhitespace: Bool { get } -} - -extension Unicode.Scalar { - /// A Boolean value indicating whether this scalar is considered - /// horizontal whitespace. - /// - /// All Unicode scalars with the general category - /// `Zs`/`Space_Separator`, along with the control character - /// `CHARACTER TABULATION` (U+0009), are considered horizontal - /// whitespace. 
- public var isHorizontalWhitespace: Bool { get } -} -``` - -We propose `\v` be named "verticalWhitespace" with the following definitions: - - -```swift -extension Character { - /// A Boolean value indicating whether this scalar is considered - /// vertical whitespace. - /// - /// All characters with an initial Unicode scalar in the general - /// category `Zl`/`Line_Separator`, or the following control - /// characters, are considered vertical whitespace (see below) - public var isVerticalWhitespace: Bool { get } -} - -extension Unicode.Scalar { - /// A Boolean value indicating whether this scalar is considered - /// vertical whitespace. - /// - /// All Unicode scalars with the general category - /// `Zl`/`Line_Separator`, along with the following control - /// characters, are considered vertical whitespace: - /// - /// - `LINE FEED (LF)` (U+000A) - /// - `LINE TABULATION` (U+000B) - /// - `FORM FEED (FF)` (U+000C) - /// - `CARRIAGE RETURN (CR)` (U+000D) - /// - `NEWLINE (NEL)` (U+0085) - public var isVerticalWhitespace: Bool { get } -} -``` - -Note that `Character.isNewline` already exists with the definition [required][lineboundary] by UTS\#18. *TBD:* Should we backport to `Unicode.Scalar`? - -`\S`, `\H`, and `\V` match the inverse of `\s`, `\h`, and `\v`, respectively. - -We propose `\R` include "verticalWhitespace" above with detection (and consumption) of the CR-LF sequence when applied to `Unicode.Scalar`. It is equivalent to `Character.isVerticalWhitespace` when applied to `Character`s. - -We are similarly not proposing any new API for `\R` until the stdlib has grapheme-breaking API over `Unicode.Scalar`. - -_
Rationale_ - -Note that "whitespace" is a term-of-art and is not correlated with visibility, which is a completely separate concept. - -We use Unicode's recommended scalar semantics for horizontal whitespace and extend that to grapheme semantics similarly to `Character.isWhitespace`. - -We use ICU's definition for vertical whitespace, similarly extended to grapheme clusters. - -
- -### Control characters: `\t`, `\r`, `\n`, `\f`, `\0`, `\e`, `\a`, `\b`, `\cX` - -We propose the following names and meanings for these escaped literals representing specific control characters: - -```swift -extension Character { - /// A horizontal tab character, `CHARACTER TABULATION` (U+0009). - public static var tab: Character { get } - - /// A carriage return character, `CARRIAGE RETURN (CR)` (U+000D). - public static var carriageReturn: Character { get } - - /// A line feed character, `LINE FEED (LF)` (U+000A). - public static var lineFeed: Character { get } - - /// A form feed character, `FORM FEED (FF)` (U+000C). - public static var formFeed: Character { get } - - /// A NULL character, `NUL` (U+0000). - public static var nul: Character { get } - - /// An escape control character, `ESC` (U+001B). - public static var escape: Character { get } - - /// A bell character, `BEL` (U+0007). - public static var bell: Character { get } - - /// A backspace character, `BS` (U+0008). - public static var backspace: Character { get } - - /// A combined carriage return and line feed as a single character denoting - // end-of-line. - public static var carriageReturnLineFeed: Character { get } - - /// Returns a control character with the given value, Control-`x`. - /// - /// This method returns a value only when you pass a letter in - /// the ASCII range as `x`: - /// - /// if let ch = Character.control("G") { - /// print("'ch' is a bell character", ch == Character.bell) - /// } else { - /// print("'ch' is not a control character") - /// } - /// // Prints "'ch' is a bell character: true" - /// - /// - Parameter x: An upper- or lowercase letter to derive - /// the control character from. - /// - Returns: Control-`x` if `x` is in the pattern `[a-zA-Z]`; - /// otherwise, `nil`. - public static func control(_ x: Unicode.Scalar) -> Character? -} - -extension Unicode.Scalar { - /// Same as above, producing Unicode.Scalar, except for CR-LF... 
-} -``` - -We also propose `isControl` properties with the following definitions: - -```swift -extension Character { - /// A Boolean value indicating whether this character represents - /// a control character. - /// - /// Control characters are a single Unicode scalar with the - /// general category `Cc`/`Control` or the CR-LF pair (`\r\n`). - public var isControl: Bool { get } -} - -extension Unicode.Scalar { - /// A Boolean value indicating whether this scalar represents - /// a control character. - /// - /// Control characters have the general category `Cc`/`Control`. - public var isControl: Bool { get } -} -``` - -*TBD*: Should we have a CR-LF static var on `Unicode.Scalar` that produces a value of type `Character`? - - -_
Rationale_ - -This approach simplifies the use of some common control characters, while making the rest available through a method call. - -
- - - -### Unicode named values and properties: `\N`, `\p`, `\P` - -`\N{NAME}` matches a Unicode scalar value with the specified name. `\p{PROPERTY}` and `\p{PROPERTY=VALUE}` match a Unicode scalar value with the given Unicode property (and value, if given). - -While most Unicode-defined properties can only match at the Unicode scalar level, some are defined to match an extended grapheme cluster. For example, `/\p{RGI_Emoji_Flag_Sequence}/` will match any flag emoji character, which are composed of two Unicode scalar values. - -`\P{...}` matches the inverse of `\p{...}`. - -Most of this is already present inside `Unicode.Scalar.Properties`, and we propose to round it out with anything missing, e.g. script and script extensions. (API is _TBD_, still working on it.) - -Even though we are not proposing any `Character`-based API, we'd like to discuss with the community whether or how to extend them to grapheme clusters. Some options: - -- Forbid in any grapheme-cluster semantic mode -- Match only single-scalar grapheme clusters with the given property -- Match any grapheme cluster that starts with the given property -- Something more-involved such as per-property reasoning - - -### POSIX character classes: `[:NAME:]` - -We propose that POSIX character classes be prefixed with "posix" in their name with APIs for testing membership of `Character`s and `Unicode.Scalar`s. `Unicode.Scalar.isASCII` and `Character.isASCII` already exist and can satisfy `[:ascii:]`, and can be used in combination with new members like `isDigit` to represent individual POSIX character classes. Alternatively, we could introduce an option-set-like `POSIXCharacterClass` and `func isPOSIX(_:POSIXCharacterClass)` since POSIX is a fully defined standard. This would cut down on the amount of API noise directly visible on `Character` and `Unicode.Scalar` significantly. 
We'd like some discussion the the community here, noting that this will become clearer as more of the string processing overview takes shape. - -POSIX's character classes represent concepts that we'd like to define at all semantic levels. We propose the following definitions, some of which are covered elsewhere in this pitch and some of which already exist today. Some Character definitions are *TBD* and we'd like more discussion with the community. - - -| POSIX class | API name | `Character` | `Unicode.Scalar` | POSIX mode value | -|-------------|----------------------|-----------------------|-------------------------------|-------------------------------| -| `[:lower:]` | lowercase | (exists) | `\p{Lowercase}` | `[a-z]` | -| `[:upper:]` | uppercase | (exists) | `\p{Uppercase}` | `[A-Z]` | -| `[:alpha:]` | alphabetic | (exists: `.isLetter`) | `\p{Alphabetic}` | `[A-Za-z]` | -| `[:alnum:]` | alphaNumeric | TBD | `[\p{Alphabetic}\p{Decimal}]` | `[A-Za-z0-9]` | -| `[:word:]` | wordCharacter | (pitched) | (pitched) | `[[:alnum:]_]` | -| `[:digit:]` | decimalDigit | (pitched) | (pitched) | `[0-9]` | -| `[:xdigit:]`| hexDigit | (exists) | `\p{Hex_Digit}` | `[0-9A-Fa-f]` | -| `[:punct:]` | punctuation | (exists) | (port from `Character`) | `[-!"#%&'()*,./:;?@[\\\]_{}]` | -| `[:blank:]` | horizontalWhitespace | (pitched) | (pitched) | `[ \t]` | -| `[:space:]` | whitespace | (exists) | `\p{Whitespace}` | `[ \t\n\r\f\v]` | -| `[:cntrl:]` | control | (pitched) | (pitched) | `[\x00-\x1f\x7f]` | -| `[:graph:]` | TBD | TBD | TBD | `[^ [:cntrl:]]` | -| `[:print:]` | TBD | TBD | TBD | `[[:graph:] ]` | - - -### Custom classes: `[...]` - -We propose that custom classes function just like set union. We propose that ranged-based custom character classes function just like `ClosedRange`. Thus, we are not proposing any additional API. - -That being said, providing grapheme cluster semantics is simultaneously obvious and tricky. 
A direct extension treats `[a-f]` as equivalent to `("a"..."f").contains()`. Strings (and thus Characters) are ordered for the purposes of efficiently maintaining programming invariants while honoring Unicode canonical equivalence. This ordering is _consistent_ but [linguistically meaningless][meaningless] and subject to implementation details such as whether we choose to normalize under NFC or NFD. - -```swift -let c: ClosedRange = "a"..."f" -c.contains("e") // true -c.contains("g") // false -c.contains("e\u{301}") // false, NFC uses precomposed é -c.contains("e\u{305}") // true, there is no precomposed e̅ -``` - -We will likely want corresponding `RangeExpression`-based API in the future and keeping consistency with ranges is important. - -We would like to discuss this problem with the community here. Even though we are not addressing regex literals specifically in this thread, it makes sense to produce suggestions for compilation errors or warnings. - -Some options: - -- Do nothing, embrace emergent behavior -- Warn/error for _any_ character class ranges -- Warn/error for character class ranges outside of a quasi-meaningful subset (e.g. ACII, albeit still has issues above) -- Warn/error for multiple-scalar grapheme clusters (albeit still has issues above) - - - -## Future Directions - -### Future API - -Library-extensible pattern matching will necessitate more types, protocols, and API in the future, many of which may involve character classes. This pitch aims to define names and semantics for exactly these kinds of API now, so that they can slot in naturally. - -### More classes or custom classes - -Future API might express custom classes or need more built-in classes. This pitch aims to establish rationale and precedent for a large number of character classes in Swift, serving as a basis that can be extended. 
- -### More lenient conversion APIs - -The proposed semantics for matching "digits" are broader than what the existing `Int(_:radix:)?` initializer accepts. It may be useful to provide additional initializers that can understand the whole breadth of characters matched by `\d`, or other related conversions. - - - - -[literals]: https://forums.swift.org/t/pitch-regular-expression-literals/52820 -[overview]: https://forums.swift.org/t/declarative-string-processing-overview/52459 -[charprops]: https://github.com/apple/swift-evolution/blob/master/proposals/0221-character-properties.md -[charpropsrationale]: https://github.com/apple/swift-evolution/blob/master/proposals/0221-character-properties.md#detailed-semantics-and-rationale -[canoneq]: https://www.unicode.org/reports/tr15/#Canon_Compat_Equivalence -[graphemes]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries -[meaningless]: https://forums.swift.org/t/declarative-string-processing-overview/52459/121 -[scalarprops]: https://github.com/apple/swift-evolution/blob/master/proposals/0211-unicode-scalar-properties.md -[ucd]: https://www.unicode.org/reports/tr44/tr44-28.html -[numerictype]: https://www.unicode.org/reports/tr44/#Numeric_Type -[derivednumeric]: https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedNumericType.txt - - -[uts18]: https://unicode.org/reports/tr18/ -[proplist]: https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt -[pcre]: https://www.pcre.org/current/doc/html/pcre2pattern.html -[perl]: https://perldoc.perl.org/perlre -[raku]: https://docs.raku.org/language/regexes -[rust]: https://docs.rs/regex/1.5.4/regex/ -[python]: https://docs.python.org/3/library/re.html -[ruby]: https://ruby-doc.org/core-2.4.0/Regexp.html -[csharp]: https://docs.microsoft.com/en-us/dotnet/standard/base-types/regular-expression-language-quick-reference -[icu]: https://unicode-org.github.io/icu/userguide/strings/regexp.html -[posix]: 
https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html -[oniguruma]: https://www.cuminas.jp/sdk/regularExpression.html -[go]: https://pkg.go.dev/regexp/syntax@go1.17.2 -[cplusplus]: https://www.cplusplus.com/reference/regex/ECMAScript/ -[ecmascript]: https://262.ecma-international.org/12.0/#sec-pattern-semantics -[re2]: https://github.com/google/re2/wiki/Syntax -[java]: https://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html diff --git a/Documentation/Evolution/ProposalOverview.md b/Documentation/Evolution/ProposalOverview.md index 8fa6096e8..f3c3ac6d1 100644 --- a/Documentation/Evolution/ProposalOverview.md +++ b/Documentation/Evolution/ProposalOverview.md @@ -43,13 +43,13 @@ Introduces `CustomMatchingRegexComponent`, which is a monadic-parser style inter ## Unicode for String Processing -- Draft: TBD +- [Draft](https://github.com/apple/swift-experimental-string-processing/blob/main/Documentation/Evolution/UnicodeForStringProcessing.md) - (Old) [Character class definitions](https://forums.swift.org/t/pitch-character-classes-for-string-processing/52920) Covers three topics: -- Proposes literal and DSL API for library-defined character classes, Unicode scripts and properties, and custom character classes. -- Proposes literal and DSL API for options that affect matching behavior. +- Proposes regex syntax and `RegexBuilder` API for options that affect matching behavior. +- Proposes regex syntax and `RegexBuilder` API for library-defined character classes, Unicode properties, and custom character classes. - Defines how Unicode scalar-based classes are extended to grapheme clusters in the different semantic and other matching modes. 
diff --git a/Documentation/Evolution/UnicodeForStringProcessing.md b/Documentation/Evolution/UnicodeForStringProcessing.md new file mode 100644 index 000000000..828d8f53c --- /dev/null +++ b/Documentation/Evolution/UnicodeForStringProcessing.md @@ -0,0 +1,872 @@ +# Unicode for String Processing + +Proposal: [SE-NNNN](NNNN-filename.md) +Authors: [Nate Cook](https://github.com/natecook1000), [Alejandro Alonso](https://github.com/Azoy) +Review Manager: TBD +Implementation: [apple/swift-experimental-string-processing][repo] +Status: **Draft** + + +## Introduction + +This proposal describes `Regex`'s rich Unicode support during regex matching, along with the character classes and options that define that behavior. + +## Motivation + +Swift's `String` type provides, by default, a view of `Character`s or [extended grapheme clusters][graphemes] whose comparison honors [Unicode canonical equivalence][canoneq]. Each character in a string can be composed of one or more Unicode scalar values, while still being treated as a single unit, equivalent to other ways of formulating the equivalent character: + +```swift +let str = "Cafe\u{301}" // "Café" +str == "Café" // true +str.dropLast() // "Caf" +str.last == "é" // true (precomposed e with acute accent) +str.last == "e\u{301}" // true (e followed by composing acute accent) +``` + +This default view is fairly novel. Most languages that support Unicode strings generally operate at the Unicode scalar level, and don't provide the same affordance for operating on a string as a collection of grapheme clusters. In Python, for example, Unicode strings report their length as the number of scalar values, and don't use canonical equivalence in comparisons: + +```python +cafe = u"Cafe\u0301" +len(cafe) # 5 +cafe == u"Café" # False +``` + +Existing regex engines follow this same model of operating at the Unicode scalar level. 
To match canonically equivalent characters, or have equivalent behavior between equivalent strings, you must normalize your string and regex to the same canonical format. + +```python +# Matches a four-element string +re.match(u"^.{4}$", cafe) # None +# Matches a string ending with 'é' +re.match(u".+é$", cafe) # None + +cafeComp = unicodedata.normalize("NFC", cafe) +re.match(u"^.{4}$", cafeComp) # <re.Match object...> +re.match(u".+é$", cafeComp) # <re.Match object...> +``` + +With Swift's string model, this behavior would be surprising and undesirable — Swift's default regex semantics must match the semantics of a `String`. + +
Other engines + +Other regex engines match character classes (such as `\w` or `.`) at the Unicode scalar value level, or even the code unit level, instead of recognizing grapheme clusters as characters. When matching the `.` character class, other languages will only match the first part of an `"e\u{301}"` grapheme cluster. Some languages, like Perl, Ruby, and Java, support an additional `\X` metacharacter, which explicitly represents a single grapheme cluster. + +| Matching `"Cafe\u{301}"` | Pattern: `^Caf.` | Remaining | Pattern: `^Caf\X` | Remaining | +|---|---|---|---|---| +| C#, Rust, Go, Python | `"Cafe"` | `"´"` | n/a | n/a | +| NSString, Java, Ruby, Perl | `"Cafe"` | `"´"` | `"Café"` | `""` | + +Other than Java's `CANON_EQ` option, the vast majority of other languages and engines are not capable of comparing with canonical equivalence. + +
+ +## Proposed solution + +In a regex's simplest form, without metacharacters or special features, matching behaves like a test for equality. A string always matches a regex that simply contains the same characters. + +```swift +let str = "Cafe\u{301}" // "Café" +str.contains(/Café/) // true +``` + +From that point, small changes continue to comport with the element counting and comparison expectations set by `String`: + +```swift +str.contains(/Caf./) // true +str.contains(/.+é/) // true +str.contains(/.+e\u{301}/) // true +str.contains(/\w+é/) // true +``` + + +For compatibility with other regex engines and the flexibility to match at both `Character` and Unicode scalar level, you can switch between matching levels for an entire regex or within select portions. This powerful capability provides the expected default behavior when working with strings, while allowing you to drop down for Unicode scalar-specific matching. + +By default, literal characters and Unicode scalar values (e.g. `\u{301}`) are coalesced into characters in the same way as a normal string, as shown above. Metacharacters, like `.` and `\w`, and custom character classes each match a single element at the current matching level. + +For example, these matches fail, because by the time the parser encounters the "`\u{301}`" Unicode scalar literal, the full `"é"` character has been matched: + +```swift +str.contains(/Caf.\u{301}/) // false - `.` matches "é" character +str.contains(/Caf\w\u{301}/) // false - `\w` matches "é" character +str.contains(/.+\u{301}/) // false - `.+` matches each character +``` + +Alternatively, we can drop down to use Unicode scalar semantics if we want to match specific Unicode sequences.
For example, these regexes match an `"e"` followed by any modifier with the specified parameters: + +```swift +str.contains(/e[\u{300}-\u{314}]/.matchingSemantics(.unicodeScalar)) +// true - matches an "e" followed by a Unicode scalar in the range U+0300 - U+0314 +str.contains(/e\p{Nonspacing Mark}/.matchingSemantics(.unicodeScalar)) +// true - matches an "e" followed by a Unicode scalar with general category "Nonspacing Mark" +``` + +Matching in Unicode scalar mode is analogous to comparing against a string's `UnicodeScalarView` — individual Unicode scalars are matched without combining them into characters or testing for canonical equivalence. + +```swift +str.contains(/Café/.matchingSemantics(.unicodeScalar)) +// false - "e\u{301}" doesn't match with /é/ +str.contains(/Cafe\u{301}/.matchingSemantics(.unicodeScalar)) +// true - "e\u{301}" matches with /e\u{301}/ +``` + +Swift's `Regex` follows the level 2 guidelines for Unicode support in regular expressions described in [Unicode Technical Standard #18][uts18], with support for Unicode character classes, canonical equivalence, grapheme cluster matching semantics, and level 2 word boundaries enabled by default. In addition to selecting the matching semantics, `Regex` provides options for selecting different matching behaviors, such as ASCII character classes or Unicode scalar semantics, which correspond more closely with other regex engines. + +## Detailed design + +First, we'll discuss the options that let you control a regex's behavior, and then explore the character classes that define your pattern. + +### Options + +Options can be enabled and disabled in two different ways: as part of [regex internal syntax][internals], or applied as methods when declaring a `Regex`.
For example, both of these `Regex`es are declared with case insensitivity: + +```swift +let regex1 = /(?i)banana/ +let regex2 = Regex { + "banana" +}.ignoresCase() +``` + +Note that `ignoresCase()` is available on any type conforming to `RegexComponent`, which means that you can always use the more readable option-setting interface in conjunction with regex literals or run-time compiled `Regex`es: + +```swift +let regex3 = /banana/.ignoresCase() +``` + +Calling an option-setting method like `ignoresCase(_:)` acts like wrapping the callee in an option-setting group `(?:...)`. That is, while it sets the behavior for the callee, it doesn’t override options that are applied to more specific regions. In this example, the middle `"na"` in `"banana"` matches case-sensitively, despite the outer call to `ignoresCase()`: + +```swift +let regex4 = Regex { + "ba" + "na".ignoresCase(false) + "na" +} +.ignoresCase() + +"banana".contains(regex4) // true +"BAnaNA".contains(regex4) // true +"BANANA".contains(regex4) // false + +// Equivalent to: +let regex5 = /(?i)ba(?-i:na)na/ +``` + +All option APIs are provided on `RegexComponent`, so they can be called on a `Regex` instance, or on any component that you would use inside a `RegexBuilder` block when the `RegexBuilder` module is imported. + +The options that `Regex` supports are shown in the table below. Options that affect _matching behavior_ are supported through both regex syntax and APIs, while options that have _structural_ or _syntactic_ effects are only supported through regex syntax.
+ +| **Matching Behavior** | | | +|------------------------------|----------------|---------------------------| +| Case insensitivity | `(?i)` | `ignoresCase()` | +| Single-line mode | `(?s)` | `dotMatchesNewlines()` | +| Multi-line mode | `(?m)` | `anchorsMatchLineEndings()` | +| ASCII-only character classes | `(?DSWP)` | `asciiOnlyDigits()`, etc | +| Unicode word boundaries | `(?w)` | `wordBoundaryKind(_:)` | +| Semantic level | `(?Xu)` | `matchingSemantics(_:)` | +| Repetition behavior | `(?U)` | `repetitionBehavior(_:)` | +| **Structural/Syntactic** | | | +| Extended syntax | `(?x)`,`(?xx)` | n/a | +| Named captures only | `(?n)` | n/a | +| Shared capture names | `(?J)` | n/a | + +#### Case insensitivity + +Regexes perform case sensitive comparisons by default. The `i` option or the `ignoresCase(_:)` method enables case insensitive comparison. + +```swift +let str = "Café" + +str.firstMatch(of: /CAFÉ/) // nil +str.firstMatch(of: /(?i)CAFÉ/) // "Café" +str.firstMatch(of: /(?i)cAfÉ/) // "Café" +``` + +Case insensitive matching uses case folding to ensure that canonical equivalence continues to operate as expected. + +**Regex syntax:** `(?i)...` or `(?i:...)` + +**`RegexBuilder` API:** + +```swift +extension RegexComponent { + /// Returns a regular expression that ignores casing when matching. + public func ignoresCase(_ ignoresCase: Bool = true) -> Regex +} +``` + +#### Single line mode (`.` matches newlines) + +The "any" metacharacter (`.`) matches any character in a string *except* newlines by default. With the `s` option enabled, `.` matches any character including newlines. + +```swift +let str = """ + <<This string + uses double-angle-brackets + to group text.>> + """ + +str.firstMatch(of: /<<.+>>/) // nil +str.firstMatch(of: /(?s)<<.+>>/) // "This string\nuses double-angle-brackets\nto group text." +``` + +This option also affects the behavior of `CharacterClass.any`, which is designed to match the behavior of the `.` regex literal component.
+ +**Regex syntax:** `(?s)...` or `(?s:...)` + +**`RegexBuilder` API:** + +```swift +extension RegexComponent { + /// Returns a regular expression where the "any" metacharacter (`.`) + /// also matches newlines. + public func dotMatchesNewlines(_ dotMatchesNewlines: Bool = true) -> Regex +} +``` + +#### Multiline mode + +By default, the start and end anchors (`^` and `$`) match only the beginning and end of a string. With the `m` option, they also match the beginning and end of each line. + +```swift +let str = """ + abc + def + ghi + """ + +str.firstMatch(of: /^abc/) // "abc" +str.firstMatch(of: /^abc$/) // nil +str.firstMatch(of: /(?m)^abc$/) // "abc" + +str.firstMatch(of: /^def/) // nil +str.firstMatch(of: /(?m)^def$/) // "def" +``` + +This option applies only to anchors used in a regex literal. The anchors defined in `RegexBuilder` are specific about matching at the start/end of the input or the line, and therefore do not correspond directly with the `^` and `$` literal anchors. + +```swift +str.firstMatch(of: Regex { Anchor.startOfInput ; "def" }) // nil +str.firstMatch(of: Regex { Anchor.startOfLine ; "def" }) // "def" +``` + +**Regex syntax:** `(?m)...` or `(?m:...)` + +**`RegexBuilder` API:** + +```swift +extension RegexComponent { + /// Returns a regular expression where the start and end of input + /// anchors (`^` and `$`) also match against the start and end of a line. + public func anchorsMatchLineEndings(_ matchLineEndings: Bool = true) -> Regex +} +``` + +#### ASCII-only character classes + +With one or more of these options enabled, the default character classes match only ASCII values instead of the full Unicode range of characters. Four options are included in this group: + +* `D`: Match only ASCII members for `\d`, `\p{Digit}`, `[:digit:]`, and the `CharacterClass.digit`. +* `S`: Match only ASCII members for `\s`, `\p{Space}`, `[:space:]`.
+* `W`: Match only ASCII members for `\w`, `\p{Word}`, `[:word:]`, `\b`, `CharacterClass.word`, and `Anchor.wordBoundary`. +* `P`: Match only ASCII members for all POSIX properties (including `digit`, `space`, and `word`). + +**Regex syntax:** `(?DSWP)...` or `(?DSWP:...)` + +**`RegexBuilder` API:** + +```swift +extension RegexComponent { + /// Returns a regular expression that only matches ASCII characters as digits. + public func asciiOnlyDigits(_ asciiOnly: Bool = true) -> Regex + + /// Returns a regular expression that only matches ASCII characters as space + /// characters. + public func asciiOnlyWhitespace(_ asciiOnly: Bool = true) -> Regex + + /// Returns a regular expression that only matches ASCII characters as "word + /// characters". + public func asciiOnlyWordCharacters(_ asciiOnly: Bool = true) -> Regex + + /// Returns a regular expression that only matches ASCII characters when + /// matching character classes. + public func asciiOnlyCharacterClasses(_ asciiOnly: Bool = true) -> Regex +} +``` + +#### Unicode word boundaries + +By default, matching word boundaries with the `\b` and `Anchor.wordBoundary` anchors uses Unicode _default word boundaries,_ specified as [Unicode level 2 regular expression support][level2-word-boundaries]. + +Disabling the `w` option switches to _[simple word boundaries][level1-word-boundaries],_ finding word boundaries at points in the input where `\b\B` or `\B\b` match. Depending on the other matching options that are enabled, this may be more compatible with the behavior of other regex engines. + +As shown in this example, the default matching behavior finds the whole first word of the string, while the match with simple word boundaries stops at the apostrophe: + +```swift +let str = "Don't look down!"
+ +str.firstMatch(of: /D\S+\b/) // "Don't" +str.firstMatch(of: /(?-w)D\S+\b/) // "Don" +``` + +You can see more differences between level 1 and level 2 word boundaries in the following table: + +| Example | Level 1 | Level 2 | +|---------------------|---------------------------------|-------------------------------------------| +| I can't do that. | ["I", "can", "t", "do", "that"] | ["I", "can't", "do", "that", "."] | +| 🔥😊👍 | ["🔥😊👍"] | ["🔥", "😊", "👍"] | +| 👩🏻👶🏿👨🏽🧑🏾👩🏼 | ["👩🏻👶🏿👨🏽🧑🏾👩🏼"] | ["👩🏻", "👶🏿", "👨🏽", "🧑🏾", "👩🏼"] | +| 🇨🇦🇺🇸🇲🇽 | ["🇨🇦🇺🇸🇲🇽"] | ["🇨🇦", "🇺🇸", "🇲🇽"] | +| 〱㋞ツ | ["〱", "㋞", "ツ"] | ["〱㋞ツ"] | +| hello〱㋞ツ | ["hello〱", "㋞", "ツ"] | ["hello", "〱㋞ツ"] | +| 나는 Chicago에 산다 | ["나는", "Chicago에", "산다"] | ["나", "는", "Chicago", "에", "산", "다"] | +| 眼睛love食物 | ["眼睛love食物"] | ["眼", "睛", "love", "食", "物"] | +| 아니ㅋㅋㅋ네 | ["아니ㅋㅋㅋ네"] | ["아", "니", "ㅋㅋㅋ", "네"] | +| Re:Zero | ["Re", "Zero"] | ["Re:Zero"] | +| \u{d}\u{a} | ["\u{d}", "\u{a}"] | ["\u{d}\u{a}"] | +| €1 234,56 | ["1", "234", "56"] | ["€", "1", "234,56"] | + + +**Regex syntax:** `(?-w)...` or `(?-w...)` + +**`RegexBuilder` API:** + +```swift +extension RegexComponent { + /// Returns a regular expression that uses the specified word boundary algorithm. + /// + /// A simple word boundary is a position in the input between two characters + /// that match `/\w\W/` or `/\W\w/`, or between the start or end of the input + /// and `\w` character. Word boundaries therefore depend on the option-defined + /// behavior of `\w`. + /// + /// The default word boundaries use a Unicode algorithm that handles some cases + /// better than simple word boundaries, such as words with internal + /// punctuation, changes in script, and Emoji. + public func wordBoundaryKind(_ wordBoundaryKind: RegexWordBoundaryKind) -> Regex +} + +public struct RegexWordBoundaryKind: Hashable { + /// A word boundary algorithm that implements the "simple word boundary" + /// Unicode recommendation. 
+ /// + /// A simple word boundary is a position in the input between two characters + /// that match `/\w\W/` or `/\W\w/`, or between the start or end of the input + /// and a `\w` character. Word boundaries therefore depend on the option- + /// defined behavior of `\w`. + public static var unicodeLevel1: Self { get } + + /// A word boundary algorithm that implements the "default word boundary" + /// Unicode recommendation. + /// + /// Default word boundaries use a Unicode algorithm that handles some cases + /// better than simple word boundaries, such as words with internal + /// punctuation, changes in script, and Emoji. + public static var unicodeLevel2: Self { get } +} +``` + +#### Matching semantic level + +When matching with grapheme cluster semantics (the default), metacharacters like `.` and `\w`, custom character classes, and character class instances like `.any` match a grapheme cluster when possible, corresponding with the default string representation. In addition, matching with grapheme cluster semantics compares characters using their canonical representation, corresponding with the way comparing strings for equality works. + +When matching with Unicode scalar semantics, metacharacters and character classes always match a single Unicode scalar value, even if that scalar comprises part of a grapheme cluster. + +These semantic levels lead to different results, especially when working with strings that have decomposed characters. In the following example, `queRegex` matches any 3-character string that begins with `"q"`. + +```swift +let composed = "qué" +let decomposed = "que\u{301}" + +let queRegex = /^q..$/ + +print(composed.contains(queRegex)) +// Prints "true" +print(decomposed.contains(queRegex)) +// Prints "true" +``` + +When using Unicode scalar semantics, however, the regex only matches the composed version of the string, because each `.` matches a single Unicode scalar value. 
+ +```swift +let queRegexScalar = queRegex.matchingSemantics(.unicodeScalar) +print(composed.contains(queRegexScalar)) +// Prints "true" +print(decomposed.contains(queRegexScalar)) +// Prints "false" +``` + +With grapheme cluster semantics, a grapheme cluster boundary is naturally enforced at the start and end of the match and every capture group. Matching with Unicode scalar semantics, on the other hand, including using the `\O` metacharacter or `.anyUnicodeScalar` character class, can yield string indices that aren't aligned to character boundaries. Take care when using indices that aren't aligned with grapheme cluster boundaries, as they may have to be rounded to a boundary if used in a `String` instance. + +```swift +let family = "👨‍👨‍👧‍👦 is a family" + +// Grapheme-cluster mode: Yields a character +let firstCharacter = /^./ +let characterMatch = family.firstMatch(of: firstCharacter)!.output +print(characterMatch) +// Prints "👨‍👨‍👧‍👦" + +// Unicode-scalar mode: Yields only part of a character +let firstUnicodeScalar = /^./.matchingSemantics(.unicodeScalar) +let unicodeScalarMatch = family.firstMatch(of: firstUnicodeScalar)!.output +print(unicodeScalarMatch) +// Prints "👨" + +// The end of `unicodeScalarMatch` is not aligned on a character boundary +print(unicodeScalarMatch.endIndex == family.index(after: family.startIndex)) +// Prints "false" +``` + +When a regex proceeds with grapheme cluster semantics from a position that _isn't_ grapheme cluster aligned, it attempts to match the partial grapheme cluster that starts at that point. In the first call to `contains(_:)` below, `\O` matches a single Unicode scalar value, as shown above, and then the engine tries to match `\s` against the remainder of the family emoji character. Because that character is not whitespace, the match fails. The second call uses `\X`, which matches the entire emoji character, and then successfully matches the following space. 
+ +```swift +// \O matches a single Unicode scalar, whatever the current semantics +family.contains(/^\O\s/) // false + +// \X matches a single character, whatever the current semantics +family.contains(/^\X\s/) // true +``` + +**Regex syntax:** `(?X)...` or `(?X:...)` for grapheme cluster semantics, `(?u)...` or `(?u:...)` for Unicode scalar semantics. + +**`RegexBuilder` API:** + +```swift +extension RegexComponent { + /// Returns a regular expression that matches with the specified semantic + /// level. + public func matchingSemantics(_ semanticLevel: RegexSemanticLevel) -> Regex +} + +public struct RegexSemanticLevel: Hashable { + /// Match at the default semantic level of a string, where each matched + /// element is a `Character`. + public static var graphemeCluster: RegexSemanticLevel + + /// Match at the semantic level of a string's `UnicodeScalarView`, where each + /// matched element is a `UnicodeScalar` value. + public static var unicodeScalar: RegexSemanticLevel +} +``` + +#### Default repetition behavior + +Regex quantifiers (`+`, `*`, and `?`) match eagerly by default when they repeat, such that they match the longest possible substring. Appending `?` to a quantifier makes it reluctant, instead, so that it matches the shortest possible substring. + +```swift +let str = "<token>A value.</token>" + +// By default, the '+' quantifier is eager, and consumes as much as possible. +str.firstMatch(of: /<.+>/) // "<token>A value.</token>" + +// Adding '?' makes the '+' quantifier reluctant, so that it consumes as little as possible. +str.firstMatch(of: /<.+?>/) // "<token>" +``` + +The `U` option toggles the "eagerness" of quantifiers, so that quantifiers are reluctant by default, and only become eager when `?` is added to the quantifier. + +```swift +// '(?U)' toggles the eagerness of quantifiers: +str.firstMatch(of: /(?U)<.+>/) // "<token>" +str.firstMatch(of: /(?U)<.+?>/) // "<token>A value.</token>"
+``` + +**Regex syntax:** `(?U)...` or `(?U:...)` + +**`RegexBuilder` API:** + +The `repetitionBehavior(_:)` method lets you set the default behavior for all quantifiers that don't explicitly provide their own behavior. For example, you can make all quantifiers behave possessively, eliminating any quantification-caused backtracking. + +```swift +extension RegexComponent { + /// Returns a regular expression where quantifiers use the specified + /// behavior by default. + public func repetitionBehavior(_ behavior: RegexRepetitionBehavior) -> Regex +} + +public struct RegexRepetitionBehavior { + /// Match as much of the input string as possible, backtracking when + /// necessary. + public static var eager: RegexRepetitionBehavior { get } + + /// Match as little of the input string as possible, expanding the matched + /// region as necessary to complete a match. + public static var reluctant: RegexRepetitionBehavior { get } + + /// Match as much of the input string as possible, performing no backtracking. + public static var possessive: RegexRepetitionBehavior { get } +} +``` + +In order for this option to have the same effect on regexes built with `RegexBuilder` as with regex syntax, the `RegexBuilder` quantifier APIs are amended to have a `nil`-defaulted optional `behavior` parameter. For example: + +```swift +extension OneOrMore { + public init<W, C0, Component: RegexComponent>( + _ behavior: RegexRepetitionBehavior? = nil, + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0), Component.Output == (W, C0) +} +``` + +When you pass `nil`, the quantifier uses the default behavior as set by this option (either eager or reluctant). If an explicit behavior is passed, that behavior is used regardless of the default. + + +--- + +### Character Classes + +We propose the following definitions for regex character classes, along with a `CharacterClass` type as part of the `RegexBuilder` module, to encapsulate and simplify character class usage within builder-style regexes.
+ +The two regexes defined in this example will match the same inputs, looking for one or more word characters followed by up to three digits, optionally separated by a space: + +```swift +let regex1 = /\w+\s?\d{,3}/ +let regex2 = Regex { + OneOrMore(.word) + Optionally(.whitespace) + Repeat(.digit, ...3) +} +``` + +You can build custom character classes by combining regex-defined classes with individual characters or ranges, or by performing common set operations such as subtracting or negating a character class. + + +#### “Any” + +The simplest character class, representing **any character**, is written as `.` or `CharacterClass.any` and is also referred to as the "dot" metacharacter. This class always matches a single `Character` or Unicode scalar value, depending on the matching semantic level. This class excludes newlines, unless "single line mode" is enabled (see section above). + +In the following example, using grapheme cluster semantics, a dot matches a grapheme cluster, so the decomposed é is treated as a single value: + +```swift +"Cafe\u{301}".contains(/C.../) +// true +``` + +For this example, using Unicode scalar semantics, a dot matches only a single Unicode scalar value, so the combining marks don't get grouped with the commas before them: + +```swift +let data = "\u{300},\u{301},\u{302},\u{303},..." +for match in data.matches(of: /(.),/.matchingSemantics(.unicodeScalar)) { + print(match.1) +} +// Prints: +// ̀ +// ́ +// ̂ +// ... +``` + +`Regex` also provides ways to select a specific level of "any" matching, without needing to change semantic levels. + +- The **any grapheme cluster** character class is written as `\X` or `CharacterClass.anyGraphemeCluster`, and matches from the current location up to the next grapheme cluster boundary. This includes matching newlines, regardless of any option settings. This metacharacter is equivalent to the regex syntax `(?s-u:.)`. 
+- The **any Unicode scalar** character class is written as `\O` or `CharacterClass.anyUnicodeScalar`, and matches exactly one Unicode scalar value at the current location. This includes matching newlines, regardless of any option settings, but only the first scalar in an `\r\n` cluster. This metacharacter is equivalent to the regex syntax `(?su:.)`. + +#### Digits + +The **decimal digit** character class is matched by `\d` or `CharacterClass.digit`. Both regexes in this example match one or more decimal digits followed by a colon: + +```swift +let regex1 = /\d+:/ +let regex2 = Regex { + OneOrMore(.digit) + ":" +} +``` + +_Unicode scalar semantics:_ Matches a Unicode scalar that has a `numericType` property equal to `.decimal`. This includes the digits from the ASCII range, from the _Halfwidth and Fullwidth Forms_ Unicode block, as well as digits in some scripts, like `DEVANAGARI DIGIT NINE` (U+096F). This corresponds to the general category `Decimal_Number`. + +_Grapheme cluster semantics:_ Matches a character made up of a single Unicode scalar that fits the decimal digit criteria above. + +_ASCII mode_: Matches a Unicode scalar in the range `0` to `9`. + + +To invert the decimal digit character class, use `\D` or `CharacterClass.digit.inverted`. + + +The **hexadecimal digit** character class is matched by `CharacterClass.hexDigit`. + +_Unicode scalar semantics:_ Matches a decimal digit, as described above, or an uppercase or lowercase `A` through `F` from the _Halfwidth and Fullwidth Forms_ Unicode block. Note that this is a broader class than described by the `UnicodeScalar.properties.isHexDigit` property, as that property only includes ASCII and fullwidth decimal digits. + +_Grapheme cluster semantics:_ Matches a character made up of a single Unicode scalar that fits the hex digit criteria above. + +_ASCII mode_: Matches a Unicode scalar in the range `0` to `9`, `a` to `f`, or `A` to `F`. 
+ +To invert the hexadecimal digit character class, use `CharacterClass.hexDigit.inverted`. + +*
Rationale* + +Unicode's recommended definition for `\d` is its [numeric type][numerictype] of "Decimal" in contrast to "Digit". It is specifically restricted to sets of ascending contiguously-encoded scalars in a decimal radix positional numeral system. Thus, it excludes "digits" such as superscript numerals from its [definition][derivednumeric] and is a proper subset of `Character.isWholeNumber`. + +We interpret Unicode's definition of the set of scalars, especially its requirement that scalars be encoded in ascending chains, to imply that this class is restricted to scalars which meaningfully encode base-10 digits. Thus, we choose to make the grapheme cluster interpretation *restrictive*. + +
+ + +#### "Word" characters + +The **word** character class is matched by `\w` or `CharacterClass.word`. This character class and its name are essentially terms of art within regexes, and represents part of a notional "word". Note that, by default, this is distinct from the algorithm for identifying word boundaries. + +_Unicode scalar semantics:_ Matches a Unicode scalar that has one of the Unicode properties `Alphabetic`, `Digit`, or `Join_Control`, or is in the general category `Mark` or `Connector_Punctuation`. + +_Grapheme cluster semantics:_ Matches a character that begins with a Unicode scalar value that fits the criteria above. + +_ASCII mode_: Matches the numbers `0` through `9`, lowercase and uppercase `A` through `Z`, and the underscore (`_`). + +To invert the word character class, use `\W` or `CharacterClass.word.inverted`. + +*
Rationale* + +Word characters include more than letters, and we went with Unicode's recommended scalar semantics. Following the Unicode recommendation that nonspacing marks remain with their base characters, we extend to grapheme clusters similarly to `Character.isLetter`. That is, combining scalars do not change the word-character-ness of the grapheme cluster. + +
+ + +#### Whitespace and newlines + +The **whitespace** character class is matched by `\s` and `CharacterClass.whitespace`. + +_Unicode scalar semantics:_ Matches a Unicode scalar that has the Unicode property `Whitespace`, including a space, a horizontal tab (U+0009), `LINE FEED (LF)` (U+000A), `LINE TABULATION` (U+000B), `FORM FEED (FF)` (U+000C), `CARRIAGE RETURN (CR)` (U+000D), and `NEWLINE (NEL)` (U+0085). Note that under Unicode scalar semantics, `\s` only matches the first scalar in a `CR`+`LF` pair. + +_Grapheme cluster semantics:_ Matches a character that begins with a `Whitespace` Unicode scalar value. This includes matching a `CR`+`LF` pair. + +_ASCII mode_: Matches characters that are both ASCII and fit the criteria given above. The current matching semantics dictate whether a `CR`+`LF` pair is matched in ASCII mode. + +The **horizontal whitespace** character class is matched by `\h` and `CharacterClass.horizontalWhitespace`. + +_Unicode scalar semantics:_ Matches a Unicode scalar that has the Unicode general category `Zs`/`Space_Separator` as well as a horizontal tab (U+0009). + +_Grapheme cluster semantics:_ Matches a character that begins with a Unicode scalar value that fits the criteria above. + +_ASCII mode_: Matches either a space (`" "`) or a horizontal tab. + +The **vertical whitespace** character class is matched by `\v` and `CharacterClass.verticalWhitespace`. Additionally, `\R` and `CharacterClass.newlineSequence` provide a way to include the `CR`+`LF` pair, even when matching with Unicode scalar semantics. + +_Unicode scalar semantics:_ Matches a Unicode scalar that has the Unicode general category `Zl`/`Line_Separator` as well as any of the following control characters: `LINE FEED (LF)` (U+000A), `LINE TABULATION` (U+000B), `FORM FEED (FF)` (U+000C), `CARRIAGE RETURN (CR)` (U+000D), and `NEWLINE (NEL)` (U+0085). Only when specified as `\R` or `CharacterClass.newlineSequence` does this match the whole `CR`+`LF` pair. 
+ +_Grapheme cluster semantics:_ Matches a character that begins with a Unicode scalar value that fits the criteria above. + +_ASCII mode_: Matches any of the four ASCII control characters listed above. The current matching semantics dictate whether a `CR`+`LF` pair is matched in ASCII mode. + +To invert these character classes, use `\S`, `\H`, and `\V`, respectively, or the `inverted` property on a `CharacterClass` instance. + +
Rationale + +Note that "whitespace" is a term-of-art and is not correlated with visibility, which is a completely separate concept. + +We use Unicode's recommended scalar semantics for horizontal and vertical whitespace, extended to grapheme clusters as in the existing `Character.isWhitespace` property. + +
+ + +#### Unicode properties + +Character classes that match **Unicode properties** are written as `\p{PROPERTY}` or `\p{PROPERTY=VALUE}`, as described in the [Run-time Regex Construction proposal][internals-properties]. + +While most Unicode properties are only defined at the scalar level, some are defined to match an extended grapheme cluster. For example, `\p{RGI_Emoji_Flag_Sequence}` will match any flag emoji character, each of which is composed of two Unicode scalar values. Such property classes will match multiple scalars, even when matching with Unicode scalar semantics. + +Unicode property matching is extended to `Character`s with a goal of consistency with other regex character classes. For `\p{Decimal}` and `\p{Hex_Digit}`, only single-scalar `Character`s can match, for the reasons described in that section, above. For all other Unicode property classes, matching `Character`s can comprise multiple scalars, as long as the first scalar matches the property. + +To invert a Unicode property character class, use `\P{...}`. + + +#### POSIX character classes: `[:NAME:]` + +**POSIX character classes** represent concepts that we'd like to define at all semantic levels. We propose the following definitions, some of which have been described above. When matching with grapheme cluster semantics, Unicode properties are extended to `Character`s as described in the rationale above, and as shown in the table below. That is, for POSIX class `[:word:]`, any `Character` that starts with a matching scalar is a match, while for `[:digit:]`, a matching `Character` must only comprise a single Unicode scalar value. 
+ +| POSIX class | Unicode property class | Character behavior | ASCII mode value | +|--------------|-----------------------------------|----------------------|-------------------------------| +| `[:lower:]` | `\p{Lowercase}` | starts-with | `[a-z]` | +| `[:upper:]` | `\p{Uppercase}` | starts-with | `[A-Z]` | +| `[:alpha:]` | `\p{Alphabetic}` | starts-with | `[A-Za-z]` | +| `[:alnum:]` | `[\p{Alphabetic}\p{Decimal}]` | starts-with | `[A-Za-z0-9]` | +| `[:word:]` | See \* below | starts-with | `[[:alnum:]_]` | +| `[:digit:]` | `\p{DecimalNumber}` | single-scalar | `[0-9]` | +| `[:xdigit:]` | `\p{Hex_Digit}` | single-scalar | `[0-9A-Fa-f]` | +| `[:punct:]` | `\p{Punctuation}` | starts-with | `[-!"#%&'()*,./:;?@[\\\]{}]` | +| `[:blank:]` | `[\p{Space_Separator}\u{09}]` | starts-with | `[ \t]` | +| `[:space:]` | `\p{Whitespace}` | starts-with | `[ \t\n\r\f\v]` | +| `[:cntrl:]` | `\p{Control}` | starts-with | `[\x00-\x1f\x7f]` | +| `[:graph:]` | See \*\* below | starts-with | `[^ [:cntrl:]]` | +| `[:print:]` | `[[:graph:][:blank:]--[:cntrl:]]` | starts-with | `[[:graph:] ]` | + +\* The Unicode scalar property definition for `[:word:]` is `[\p{Alphanumeric}\p{Mark}\p{Join_Control}\p{Connector_Punctuation}]`. +\*\* The Unicode scalar property definition for `[:graph:]` is `[^\p{Space}\p{Control}\p{Surrogate}\p{Unassigned}]`. + +#### Custom classes + +Custom classes function as the set union of their individual components, whether those parts are individual characters, individual Unicode scalar values, ranges, Unicode property classes or POSIX classes, or other custom classes. + +- Individual characters and scalars will be tested using the same behavior as if they were listed in an alternation. That is, a custom character class like `[abc]` is equivalent to `(a|b|c)` under the same options and modes. +- When in grapheme cluster semantic mode, ranges of characters will test for membership using NFD form (or NFKD when performing caseless matching). 
This differs from how a `ClosedRange` would operate its `contains` method, since that depends on `String`'s `Comparable` conformance, but the decomposed comparison better aligns with the canonical equivalence matching used elsewhere in `Regex`. +- A custom character class will match a maximum of one `Character` or `UnicodeScalar`, depending on the matching semantic level. This means that a custom character class with extended grapheme cluster members may not match anything while using scalar semantics. + +Inside regexes, custom classes are enclosed in square brackets `[...]`, and can be nested or combined using set operators like `&&`. For more detail, see the [Run-time Regex Construction proposal][internals-charclass]. + +With `RegexBuilder`'s `CharacterClass` type, you can use built-in character classes with ranges and groups of characters. For example, to parse a valid octodecimal number, you could define a custom character class that combines `.digit` with a range of characters. + +```swift +let octoDecimalRegex: Regex<(Substring, Int?)> = Regex { + let charClass = CharacterClass(.digit, "a"..."h").ignoresCase() + Capture { + OneOrMore(charClass) + } transform: { Int($0, radix: 18) } +} +``` + +The full `CharacterClass` API is as follows: + +```swift +public struct CharacterClass: RegexComponent { + public var regex: Regex { get } + + public var inverted: CharacterClass { get } +} + +extension RegexComponent where Self == CharacterClass { + public static var any: CharacterClass { get } + + public static var anyGraphemeCluster: CharacterClass { get } + + public static var anyUnicodeScalar: CharacterClass { get } + + public static var digit: CharacterClass { get } + + public static var hexDigit: CharacterClass { get } + + public static var word: CharacterClass { get } + + public static var whitespace: CharacterClass { get } + + public static var horizontalWhitespace: CharacterClass { get } + + public static var newlineSequence: CharacterClass { get } + + public 
static var verticalWhitespace: CharacterClass { get } +} + +extension RegexComponent where Self == CharacterClass { + /// Returns a character class that matches any character in the given string + /// or sequence. + public static func anyOf(_ s: S) -> CharacterClass + where S.Element == Character + + /// Returns a character class that matches any unicode scalar in the given + /// sequence. + public static func anyOf(_ s: S) -> CharacterClass + where S.Element == UnicodeScalar +} + +// Unicode properties +extension CharacterClass { + /// Returns a character class that matches elements in the given Unicode + /// general category. + public static func generalCategory(_ category: Unicode.GeneralCategory) -> CharacterClass +} + +// Set algebra methods +extension CharacterClass { + /// Creates a character class that combines the given classes in a union. + public init(_ first: CharacterClass, _ rest: CharacterClass...) + + /// Returns a character class from the union of this class and the given class. + public func union(_ other: CharacterClass) -> CharacterClass + + /// Returns a character class from the intersection of this class and the given class. + public func intersection(_ other: CharacterClass) -> CharacterClass + + /// Returns a character class by subtracting the given class from this class. + public func subtracting(_ other: CharacterClass) -> CharacterClass + + /// Returns a character class matching elements in one or the other, but not both, + /// of this class and the given class. + public func symmetricDifference(_ other: CharacterClass) -> CharacterClass +} + +/// Range syntax for characters in `CharacterClass`es. +public func ...(lhs: Character, rhs: Character) -> CharacterClass + +/// Range syntax for unicode scalars in `CharacterClass`es. 
+@_disfavoredOverload +public func ...(lhs: UnicodeScalar, rhs: UnicodeScalar) -> CharacterClass +``` + +## Source compatibility + +Everything in this proposal is additive, and has no compatibility effect on existing source code. + +## Effect on ABI stability + +Everything in this proposal is additive, and has no effect on existing stable ABI. + +## Effect on API resilience + +N/A + +## Future directions + +### Expanded options and modifiers + +The initial version of `Regex` includes only the options described above. Filling out the remainder of options described in the [Run-time Regex Construction proposal][internals] could be completed as future work, as well as additional improvements, such as adding an option that makes a regex match only at the start of a string. + +### Extensions to Character and Unicode Scalar APIs + +An earlier version of this pitch described adding standard library APIs to `Character` and `UnicodeScalar` for each of the supported character classes, as well as convenient static members for control characters. In addition, regex literals support Unicode property features that don’t currently exist in the standard library, such as a scalar’s script or extended category, or creating a scalar by its Unicode name instead of its scalar value. These kinds of additions are left as future work. + +### Byte semantic mode + +A future `Regex` version could support a byte-level semantic mode in addition to grapheme cluster and Unicode scalar semantics. Byte-level semantics would allow matching individual bytes, potentially providing the capability of parsing string and non-string data together. + +### More general `CharacterSet` replacement + +Foundation's `CharacterSet` type is in some ways similar to the `CharacterClass` type defined in this proposal. `CharacterSet` is primarily a set type that is defined over Unicode scalars, and can therefore sometimes be awkward to use in conjunction with Swift `String`s. 
The proposed `CharacterClass` type is a `RegexBuilder`-specific type, and as such isn't intended to be a full general purpose replacement. Future work could involve expanding upon the `CharacterClass` API or introducing a different type to fill that role. + +## Alternatives considered + +### Operate on String.UnicodeScalarView instead of using semantic modes + +Instead of providing APIs to select whether `Regex` matching is `Character`-based vs. `UnicodeScalar`-based, we could instead provide methods to match against the different views of a string. This different approach has multiple drawbacks: + +* As the scalar level used when matching changes the behavior of individual components of a `Regex`, it’s more appropriate to specify the semantic level at the declaration site than the call site. +* With the proposed options model, you can define a Regex that includes different semantic levels for different portions of the match, which would be impossible with a call site-based approach. + +### Binary word boundary option method + +A prior version of this proposal used a binary method for setting the word boundary algorithm, called `usingSimpleWordBoundaries()`. A method taking a `RegexWordBoundaryKind` instance is included in the proposal instead, to leave room for implementing other word boundary algorithms in the future. 
+ + +[repo]: https://github.com/apple/swift-experimental-string-processing/ +[option-scoping]: https://github.com/apple/swift-experimental-string-processing/blob/main/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md#matching-options +[internals]: https://github.com/apple/swift-experimental-string-processing/blob/main/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md +[internals-properties]: https://github.com/apple/swift-experimental-string-processing/blob/main/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md#character-properties +[internals-charclass]: https://github.com/apple/swift-experimental-string-processing/blob/main/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md#custom-character-classes +[level1-word-boundaries]:https://unicode.org/reports/tr18/#Simple_Word_Boundaries +[level2-word-boundaries]:https://unicode.org/reports/tr18/#RL2.3 + +[overview]: https://forums.swift.org/t/declarative-string-processing-overview/52459 +[charprops]: https://github.com/apple/swift-evolution/blob/master/proposals/0221-character-properties.md +[charpropsrationale]: https://github.com/apple/swift-evolution/blob/master/proposals/0221-character-properties.md#detailed-semantics-and-rationale +[canoneq]: https://www.unicode.org/reports/tr15/#Canon_Compat_Equivalence +[graphemes]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries +[meaningless]: https://forums.swift.org/t/declarative-string-processing-overview/52459/121 +[scalarprops]: https://github.com/apple/swift-evolution/blob/master/proposals/0211-unicode-scalar-properties.md +[ucd]: https://www.unicode.org/reports/tr44/tr44-28.html +[numerictype]: https://www.unicode.org/reports/tr44/#Numeric_Type +[derivednumeric]: https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedNumericType.txt + + +[uts18]: https://unicode.org/reports/tr18/ +[proplist]: https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt +[pcre]: https://www.pcre.org/current/doc/html/pcre2pattern.html 
+[perl]: https://perldoc.perl.org/perlre +[raku]: https://docs.raku.org/language/regexes +[rust]: https://docs.rs/regex/1.5.4/regex/ +[python]: https://docs.python.org/3/library/re.html +[ruby]: https://ruby-doc.org/core-2.4.0/Regexp.html +[csharp]: https://docs.microsoft.com/en-us/dotnet/standard/base-types/regular-expression-language-quick-reference +[icu]: https://unicode-org.github.io/icu/userguide/strings/regexp.html +[posix]: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html +[oniguruma]: https://www.cuminas.jp/sdk/regularExpression.html +[go]: https://pkg.go.dev/regexp/syntax@go1.17.2 +[cplusplus]: https://www.cplusplus.com/reference/regex/ECMAScript/ +[ecmascript]: https://262.ecma-international.org/12.0/#sec-pattern-semantics +[re2]: https://github.com/google/re2/wiki/Syntax +[java]: https://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index cc5afda39..1483996d1 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -445,6 +445,14 @@ class RegexDSLTests: XCTestCase { Repeat(2...) { "e" } Repeat(0...) 
{ "f" } } + + let octoDecimalRegex: Regex<(Substring, Int?)> = Regex { + let charClass = CharacterClass(.digit, "a"..."h")//.ignoringCase() + Capture { + OneOrMore(charClass) + } transform: { Int($0, radix: 18) } + } + XCTAssertEqual("ab12".firstMatch(of: octoDecimalRegex)!.output.1, 61904) } func testAssertions() throws { From 81bc5d0979dcda578c87683b476996a59633e866 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Fri, 22 Apr 2022 10:50:17 -0600 Subject: [PATCH 17/18] Updates for algorithms proposal (#319) * Rename custom match prefix protocol and add doc comments * Update algo proposal prose --- Documentation/Evolution/ProposalOverview.md | 2 +- Documentation/Evolution/RegexTypeOverview.md | 10 ++--- .../Evolution/StringProcessingAlgorithms.md | 43 ++++++++++--------- .../Regex/CustomComponents.swift | 39 +++++++++++++++++ .../Regex/DSLConsumers.swift | 29 ------------- Tests/RegexBuilderTests/CustomTests.swift | 12 +++--- Tests/RegexBuilderTests/RegexDSLTests.swift | 4 +- 7 files changed, 75 insertions(+), 64 deletions(-) create mode 100644 Sources/_StringProcessing/Regex/CustomComponents.swift delete mode 100644 Sources/_StringProcessing/Regex/DSLConsumers.swift diff --git a/Documentation/Evolution/ProposalOverview.md b/Documentation/Evolution/ProposalOverview.md index f3c3ac6d1..898e0db20 100644 --- a/Documentation/Evolution/ProposalOverview.md +++ b/Documentation/Evolution/ProposalOverview.md @@ -39,7 +39,7 @@ Covers the "interior" syntax, extended syntaxes, run-time construction of a rege Proposes a slew of Regex-powered algorithms. -Introduces `CustomMatchingRegexComponent`, which is a monadic-parser style interface for external parsers to be used as components of a regex. +Introduces `CustomPrefixMatchRegexComponent`, which is a monadic-parser style interface for external parsers to be used as components of a regex. 
## Unicode for String Processing diff --git a/Documentation/Evolution/RegexTypeOverview.md b/Documentation/Evolution/RegexTypeOverview.md index 6eed648f0..68dd6ccc7 100644 --- a/Documentation/Evolution/RegexTypeOverview.md +++ b/Documentation/Evolution/RegexTypeOverview.md @@ -231,7 +231,7 @@ The result builder allows for inline failable value construction, which particip Swift regexes describe an unambiguous algorithm, where choice is ordered and effects can be reliably observed. For example, a `print()` statement inside the `TryCapture`'s transform function will run whenever the overall algorithm naturally dictates an attempt should be made. Optimizations can only elide such calls if they can prove it is behavior-preserving (e.g. "pure"). -`CustomMatchingRegexComponent`, discussed in [String Processing Algorithms][pitches], allows industrial-strength parsers to be used a regex components. This allows us to drop the overly-permissive pre-parsing step: +`CustomPrefixMatchRegexComponent`, discussed in [String Processing Algorithms][pitches], allows industrial-strength parsers to be used as regex components. This allows us to drop the overly-permissive pre-parsing step: ```swift func processEntry(_ line: String) -> Transaction? { @@ -431,7 +431,7 @@ Regular expressions have a deservedly mixed reputation, owing to their historica * "Regular expressions are bad because you should use a real parser" - In other systems, you're either in or you're out, leading to a gravitational pull to stay in when... 
you should get out - - Our remedy is interoperability with real parsers via `CustomMatchingRegexComponent` + - Our remedy is interoperability with real parsers via `CustomPrefixMatchRegexComponent` - Literals with refactoring actions provide an incremental off-ramp from regex syntax to result builders and real parsers * "Regular expressions are bad because ugly unmaintainable syntax" - We propose literals with source tools support, allowing for better syntax highlighting and analysis @@ -516,7 +516,7 @@ Regex are compiled into an intermediary representation and fairly simple analysi ### Future work: parser combinators -What we propose here is an incremental step towards better parsing support in Swift using parser-combinator style libraries. The underlying execution engine supports recursive function calls and mechanisms for library extensibility. `CustomMatchingRegexComponent`'s protocol requirement is effectively a [monadic parser](https://homepages.inf.ed.ac.uk/wadler/papers/marktoberdorf/baastad.pdf), meaning `Regex` provides a regex-flavored combinator-like system. +What we propose here is an incremental step towards better parsing support in Swift using parser-combinator style libraries. The underlying execution engine supports recursive function calls and mechanisms for library extensibility. `CustomPrefixMatchRegexComponent`'s protocol requirement is effectively a [monadic parser](https://homepages.inf.ed.ac.uk/wadler/papers/marktoberdorf/baastad.pdf), meaning `Regex` provides a regex-flavored combinator-like system. An issues with traditional parser combinator libraries are the compilation barriers between call-site and definition, resulting in excessive and overly-cautious backtracking traffic. These can be eliminated through better [compilation techniques](https://core.ac.uk/download/pdf/148008325.pdf). As mentioned above, Swift's support for custom static compilation is still under development. 
@@ -565,9 +565,9 @@ Regexes are often used for tokenization and tokens can be represented with Swift ### Future work: baked-in localized processing -- `CustomMatchingRegexComponent` gives an entry point for localized processors +- `CustomPrefixMatchRegexComponent` gives an entry point for localized processors - Future work includes (sub?)protocols to communicate localization intent --> -[pitches]: https://github.com/apple/swift-experimental-string-processing/blob/main/Documentation/Evolution/ProposalOverview.md \ No newline at end of file +[pitches]: https://github.com/apple/swift-experimental-string-processing/blob/main/Documentation/Evolution/ProposalOverview.md diff --git a/Documentation/Evolution/StringProcessingAlgorithms.md b/Documentation/Evolution/StringProcessingAlgorithms.md index 74416ae63..edefbd19b 100644 --- a/Documentation/Evolution/StringProcessingAlgorithms.md +++ b/Documentation/Evolution/StringProcessingAlgorithms.md @@ -8,9 +8,9 @@ We propose: 1. New regex-powered algorithms over strings, bringing the standard library up to parity with scripting languages 2. Generic `Collection` equivalents of these algorithms in terms of subsequences -3. `protocol CustomMatchingRegexComponent`, which allows 3rd party libraries to provide their industrial-strength parsers as intermixable components of regexes +3. `protocol CustomPrefixMatchRegexComponent`, which allows 3rd party libraries to provide their industrial-strength parsers as intermixable components of regexes -This proposal is part of a larger [regex-powered string processing initiative](https://forums.swift.org/t/declarative-string-processing-overview/52459). Throughout the document, we will reference the still-in-progress [`RegexProtocol`, `Regex`](https://github.com/apple/swift-experimental-string-processing/blob/main/Documentation/Evolution/StronglyTypedCaptures.md), and result builder DSL, but these are in flux and not formally part of this proposal. 
Further discussion of regex specifics is out of scope of this proposal and better discussed in another thread (see [Pitch and Proposal Status](https://github.com/apple/swift-experimental-string-processing/issues/107) for links to relevant threads). +This proposal is part of a larger [regex-powered string processing initiative](https://github.com/apple/swift-evolution/blob/main/proposals/0350-regex-type-overview.md), the status of each proposal is tracked [here](https://github.com/apple/swift-experimental-string-processing/blob/main/Documentation/Evolution/ProposalOverview.md). Further discussion of regex specifics is out of scope of this proposal and better discussed in their relevant reviews. ## Motivation @@ -91,18 +91,18 @@ Note: Only a subset of Python's string processing API are included in this table ### Complex string processing -Even with the API additions, more complex string processing quickly becomes unwieldy. Up-coming support for authoring regexes in Swift help alleviate this somewhat, but string processing in the modern world involves dealing with localization, standards-conforming validation, and other concerns for which a dedicated parser is required. +Even with the API additions, more complex string processing quickly becomes unwieldy. String processing in the modern world involves dealing with localization, standards-conforming validation, and other concerns for which a dedicated parser is required. Consider parsing the date field `"Date: Wed, 16 Feb 2022 23:53:19 GMT"` in an HTTP header as a `Date` type. 
The naive approach is to search for a substring that looks like a date string (`16 Feb 2022`), and attempt to post-process it as a `Date` with a date parser: ```swift let regex = Regex { - capture { - oneOrMore(.digit) + Capture { + OneOrMore(.digit) " " - oneOrMore(.word) + OneOrMore(.word) " " - oneOrMore(.digit) + OneOrMore(.digit) } } @@ -128,21 +128,21 @@ DEBIT 03/24/2020 IRX tax payment ($52,249.98) Parsing a currency string such as `$3,020.85` with regex is also tricky, as it can contain localized and currency symbols in addition to accounting conventions. This is why Foundation provides industrial-strength parsers for localized strings. -## Proposed solution +## Proposed solution ### Complex string processing -We propose a `CustomMatchingRegexComponent` protocol which allows types from outside the standard library participate in regex builders and `RegexComponent` algorithms. This allows types, such as `Date.ParseStrategy` and `FloatingPointFormatStyle.Currency`, to be used directly within a regex: +We propose a `CustomPrefixMatchRegexComponent` protocol which allows types from outside the standard library to participate in regex builders and `RegexComponent` algorithms. 
This allows types, such as `Date.ParseStrategy` and `FloatingPointFormatStyle.Currency`, to be used directly within a regex: ```swift let dateRegex = Regex { - capture(dateParser) + Capture(dateParser) } let date: Date = header.firstMatch(of: dateRegex).map(\.result.1) let currencyRegex = Regex { - capture(.localizedCurrency(code: "USD").sign(strategy: .accounting)) + Capture(.localizedCurrency(code: "USD").sign(strategy: .accounting)) } let amount: [Decimal] = statement.matches(of: currencyRegex).map(\.result.1) @@ -167,24 +167,25 @@ We also propose the following regex-powered algorithms as well as their generic |`matches(of:)`| Returns a collection containing all matches of the specified `RegexComponent` | -## Detailed design +## Detailed design -### `CustomMatchingRegexComponent` +### `CustomPrefixMatchRegexComponent` -`CustomMatchingRegexComponent` inherits from `RegexComponent` and satisfies its sole requirement; Conformers can be used with all of the string algorithms generic over `RegexComponent`. +`CustomPrefixMatchRegexComponent` inherits from `RegexComponent` and satisfies its sole requirement. Conformers can be used with all of the string algorithms generic over `RegexComponent`. ```swift -/// A protocol for custom match functionality. -public protocol CustomMatchingRegexComponent : RegexComponent { - /// Match the input string within the specified bounds, beginning at the given index, and return - /// the end position (upper bound) of the match and the matched instance. +/// A protocol allowing custom types to function as regex components by +/// providing the raw functionality backing `prefixMatch`. +public protocol CustomPrefixMatchRegexComponent: RegexComponent { + /// Process the input string within the specified bounds, beginning at the given index, and return + /// the end position (upper bound) of the match and the produced output. /// - Parameters: /// - input: The string in which the match is performed. 
/// - index: An index of `input` at which to begin matching. /// - bounds: The bounds in `input` in which the match is performed. /// - Returns: The upper bound where the match terminates and a matched instance, or `nil` if /// there isn't a match. - func match( + func consuming( _ input: String, startingAt index: String.Index, in bounds: Range @@ -198,8 +199,8 @@ public protocol CustomMatchingRegexComponent : RegexComponent { We use Foundation `FloatingPointFormatStyle.Currency` as an example for protocol conformance. It would implement the `match` function with `Match` being a `Decimal`. It could also add a static function `.localizedCurrency(code:)` as a member of `RegexComponent`, so it can be referred as `.localizedCurrency(code:)` in the `Regex` result builder: ```swift -extension FloatingPointFormatStyle.Currency : CustomMatchingRegexComponent { - public func match( +extension FloatingPointFormatStyle.Currency : CustomPrefixMatchRegexComponent { + public func consuming( _ input: String, startingAt index: String.Index, in bounds: Range diff --git a/Sources/_StringProcessing/Regex/CustomComponents.swift b/Sources/_StringProcessing/Regex/CustomComponents.swift new file mode 100644 index 000000000..e8111555c --- /dev/null +++ b/Sources/_StringProcessing/Regex/CustomComponents.swift @@ -0,0 +1,39 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +@available(SwiftStdlib 5.7, *) +/// A protocol allowing custom types to function as regex components by +/// providing the raw functionality backing `prefixMatch`. 
+public protocol CustomPrefixMatchRegexComponent: RegexComponent { + /// Process the input string within the specified bounds, beginning at the given index, and return + /// the end position (upper bound) of the match and the produced output. + /// - Parameters: + /// - input: The string in which the match is performed. + /// - index: An index of `input` at which to begin matching. + /// - bounds: The bounds in `input` in which the match is performed. + /// - Returns: The upper bound where the match terminates and a matched instance, or `nil` if + /// there isn't a match. + func consuming( + _ input: String, + startingAt index: String.Index, + in bounds: Range + ) throws -> (upperBound: String.Index, output: RegexOutput)? +} + +@available(SwiftStdlib 5.7, *) +extension CustomPrefixMatchRegexComponent { + public var regex: Regex { + let node: DSLTree.Node = .matcher(RegexOutput.self, { input, index, bounds in + try consuming(input, startingAt: index, in: bounds) + }) + return Regex(node: node) + } +} diff --git a/Sources/_StringProcessing/Regex/DSLConsumers.swift b/Sources/_StringProcessing/Regex/DSLConsumers.swift deleted file mode 100644 index eb8ace8d3..000000000 --- a/Sources/_StringProcessing/Regex/DSLConsumers.swift +++ /dev/null @@ -1,29 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// -//===----------------------------------------------------------------------===// - -@available(SwiftStdlib 5.7, *) -public protocol CustomMatchingRegexComponent: RegexComponent { - func match( - _ input: String, - startingAt index: String.Index, - in bounds: Range - ) throws -> (upperBound: String.Index, output: RegexOutput)? 
-} - -@available(SwiftStdlib 5.7, *) -extension CustomMatchingRegexComponent { - public var regex: Regex { - let node: DSLTree.Node = .matcher(RegexOutput.self, { input, index, bounds in - try match(input, startingAt: index, in: bounds) - }) - return Regex(node: node) - } -} diff --git a/Tests/RegexBuilderTests/CustomTests.swift b/Tests/RegexBuilderTests/CustomTests.swift index d17c3a142..269f9ebaa 100644 --- a/Tests/RegexBuilderTests/CustomTests.swift +++ b/Tests/RegexBuilderTests/CustomTests.swift @@ -14,13 +14,13 @@ import _StringProcessing @testable import RegexBuilder // A nibbler processes a single character from a string -private protocol Nibbler: CustomMatchingRegexComponent { +private protocol Nibbler: CustomPrefixMatchRegexComponent { func nibble(_: Character) -> RegexOutput? } extension Nibbler { // Default implementation, just feed the character in - func match( + func consuming( _ input: String, startingAt index: String.Index, in bounds: Range @@ -49,10 +49,10 @@ private struct Asciibbler: Nibbler { } } -private struct IntParser: CustomMatchingRegexComponent { +private struct IntParser: CustomPrefixMatchRegexComponent { struct ParseError: Error, Hashable {} typealias RegexOutput = Int - func match(_ input: String, + func consuming(_ input: String, startingAt index: String.Index, in bounds: Range ) throws -> (upperBound: String.Index, output: Int)? { @@ -71,7 +71,7 @@ private struct IntParser: CustomMatchingRegexComponent { } } -private struct CurrencyParser: CustomMatchingRegexComponent { +private struct CurrencyParser: CustomPrefixMatchRegexComponent { enum Currency: String, Hashable { case usd = "USD" case ntd = "NTD" @@ -84,7 +84,7 @@ private struct CurrencyParser: CustomMatchingRegexComponent { } typealias RegexOutput = Currency - func match(_ input: String, + func consuming(_ input: String, startingAt index: String.Index, in bounds: Range ) throws -> (upperBound: String.Index, output: Currency)? 
{ diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 1483996d1..cd1c94657 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -863,9 +863,9 @@ class RegexDSLTests: XCTestCase { var patch: Int var dev: String? } - struct SemanticVersionParser: CustomMatchingRegexComponent { + struct SemanticVersionParser: CustomPrefixMatchRegexComponent { typealias RegexOutput = SemanticVersion - func match( + func consuming( _ input: String, startingAt index: String.Index, in bounds: Range From 89b80bfe180ea3940ae9a35bb5765f69f448ec4e Mon Sep 17 00:00:00 2001 From: Rintaro Ishizaki Date: Mon, 11 Apr 2022 09:59:04 -0700 Subject: [PATCH 18/18] Preparation for location aware diagnostics in the compiler. Moving `libswiftLexRegexLiteral()` and `libswiftParseRegexLiteral()` to the compiler repository because these functions are basically just bridging the compiler to the actual lexing/parsing function. * Make some Lexing error APIs public. * Make LocatedErrorProtocol public and expose the `location` property * Shift the location of `LocatedError` in `parseWithDelimiters` so the client can get the valid string indices of the passed-in literal string.
--- .../Regex/Parse/DelimiterLexing.swift | 18 ++++++++++++------ .../Regex/Parse/LexicalAnalysis.swift | 2 +- Sources/_RegexParser/Regex/Parse/Mocking.swift | 3 +++ Sources/_RegexParser/Regex/Parse/Parse.swift | 12 +++++++++++- .../Regex/Parse/SourceLocation.swift | 11 +++++++++-- 5 files changed, 36 insertions(+), 10 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift index bee782043..dd142f016 100644 --- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift @@ -9,7 +9,7 @@ // //===----------------------------------------------------------------------===// -struct Delimiter: Hashable { +public struct Delimiter: Hashable { let kind: Kind let poundCount: Int @@ -74,8 +74,8 @@ extension Delimiter { } } -struct DelimiterLexError: Error, CustomStringConvertible { - enum Kind: Hashable { +public struct DelimiterLexError: Error, CustomStringConvertible { + public enum Kind: Hashable { case unterminated case invalidUTF8 // TODO: better range reporting case unknownDelimiter @@ -83,17 +83,17 @@ struct DelimiterLexError: Error, CustomStringConvertible { case multilineClosingNotOnNewline } - var kind: Kind + public var kind: Kind /// The pointer at which to resume lexing. 
- var resumePtr: UnsafeRawPointer + public var resumePtr: UnsafeRawPointer init(_ kind: Kind, resumeAt resumePtr: UnsafeRawPointer) { self.kind = kind self.resumePtr = resumePtr } - var description: String { + public var description: String { switch kind { case .unterminated: return "unterminated regex literal" case .invalidUTF8: return "invalid UTF-8 found in source file" @@ -462,3 +462,9 @@ func lexRegex( var lexer = DelimiterLexer(start: start, end: end, delimiters: delimiters) return try lexer.lex() } + +public func lexRegex( + start: UnsafeRawPointer, end: UnsafeRawPointer +) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { + return try lexRegex(start: start, end: end, delimiters: Delimiter.enabledDelimiters) +} diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 6a61ccdf7..9633b607e 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -24,7 +24,7 @@ API convention: extension Error { func addingLocation(_ loc: Range) -> Error { // If we're already a LocatedError, don't change the location. 
- if self is _LocatedErrorProtocol { + if self is LocatedErrorProtocol { return self } return Source.LocatedError(self, loc) diff --git a/Sources/_RegexParser/Regex/Parse/Mocking.swift b/Sources/_RegexParser/Regex/Parse/Mocking.swift index dd02e0fc7..56294e2d3 100644 --- a/Sources/_RegexParser/Regex/Parse/Mocking.swift +++ b/Sources/_RegexParser/Regex/Parse/Mocking.swift @@ -9,6 +9,7 @@ // //===----------------------------------------------------------------------===// +@available(*, deprecated, message: "moving to SwiftCompilerModules") private func copyCString(_ str: String) -> UnsafePointer { let count = str.utf8.count + 1 return str.withCString { @@ -36,6 +37,7 @@ private func copyCString(_ str: String) -> UnsafePointer { /// - Returns: A bool indicating whether lexing was completely erroneous, and /// cannot be recovered from, or false if there either was no error, /// or there was a recoverable error. +@available(*, deprecated, message: "moving to SwiftCompilerModules") func libswiftLexRegexLiteral( _ curPtrPtr: UnsafeMutablePointer?>?, _ bufferEndPtr: UnsafePointer?, @@ -93,6 +95,7 @@ public let currentRegexLiteralFormatVersion: CUnsignedInt = 1 /// capture structure. /// - captureStructureSize: The size of the capture structure buffer. Must be /// greater than or equal to `strlen(inputPtr)`. 
+@available(*, deprecated, message: "moving to SwiftCompilerModules") func libswiftParseRegexLiteral( _ inputPtr: UnsafePointer?, _ errOut: UnsafeMutablePointer?>?, diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 975012546..a2790924a 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -583,5 +583,15 @@ public func parseWithDelimiters( _ regex: S ) throws -> AST where S.SubSequence == Substring { let (contents, delim) = droppingRegexDelimiters(String(regex)) - return try parse(contents, defaultSyntaxOptions(delim, contents: contents)) + do { + return try parse(contents, defaultSyntaxOptions(delim, contents: contents)) + } catch let error as LocatedErrorProtocol { + // Convert the range in 'contents' to the range in 'regex'. + let delimCount = delim.opening.count + let offsets = contents.offsets(of: error.location.range) + let startIndex = regex.index(atOffset: delimCount + offsets.lowerBound) + let endIndex = regex.index(atOffset: delimCount + offsets.upperBound) + + throw error._typeErasedError.addingLocation(startIndex..: Error, _LocatedErrorProtocol { + public struct LocatedError: Error, LocatedErrorProtocol { public let error: E public let location: SourceLocation @@ -118,4 +121,8 @@ extension Source.LocatedError: CustomStringConvertible { // we present the message to the compiler. "\(error)" } + + public var _typeErasedError: Error { + return error + } }