Skip to content

[swift/main] Adds SPI for a NSRE compatibility mode option (#698) #700

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Sources/_RegexParser/Regex/AST/MatchingOptions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ extension AST {

// Swift-only default possessive quantifier
case possessiveByDefault // t.b.d.

// NSRegularExpression compatibility special-case
case nsreCompatibleDot // no AST representation
}

public var kind: Kind
Expand Down
3 changes: 2 additions & 1 deletion Sources/_RegexParser/Regex/Parse/Sema.swift
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,8 @@ extension RegexValidator {

case .caseInsensitive, .possessiveByDefault, .reluctantByDefault,
.singleLine, .multiline, .namedCapturesOnly, .extended, .extraExtended,
.asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps:
.asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps,
.nsreCompatibleDot:
break
}
}
Expand Down
14 changes: 10 additions & 4 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ fileprivate extension Compiler.ByteCodeGen {
emitAnyNonNewline()

case .dot:
emitDot()
try emitDot()

case let .char(c):
emitCharacter(c)
Expand Down Expand Up @@ -238,9 +238,15 @@ fileprivate extension Compiler.ByteCodeGen {
}
}

mutating func emitDot() {
mutating func emitDot() throws {
if options.dotMatchesNewline {
emitAny()
if options.usesNSRECompatibleDot {
try emitAlternation([
.atom(.characterClass(.newlineSequence)),
.atom(.anyNonNewline)])
} else {
emitAny()
}
} else {
emitAnyNonNewline()
}
Expand Down Expand Up @@ -964,7 +970,7 @@ fileprivate extension Compiler.ByteCodeGen {
case let .customCharacterClass(ccc):
if ccc.containsDot {
if !ccc.isInverted {
emitDot()
try emitDot()
} else {
throw Unsupported("Inverted any")
}
Expand Down
7 changes: 7 additions & 0 deletions Sources/_StringProcessing/MatchingOptions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ extension MatchingOptions {
? .graphemeCluster
: .unicodeScalar
}

/// Whether the last entry of the option stack contains the
/// `nsreCompatibleDot` option.
var usesNSRECompatibleDot: Bool {
  // NOTE(review): the force-unwrap presumes the option stack is never
  // empty — confirm against the rest of this type.
  let activeOptions = stack.last!
  return activeOptions.contains(.nsreCompatibleDot)
}
}

// MARK: - Implementation
Expand All @@ -141,6 +145,7 @@ extension MatchingOptions {
// Not available via regex literal flags
case transparentBounds
case withoutAnchoringBounds
case nsreCompatibleDot

// Oniguruma options
case asciiOnlyDigit
Expand Down Expand Up @@ -197,6 +202,8 @@ extension MatchingOptions {
self = .byteSemantics
case .possessiveByDefault:
self = .possessiveByDefault
case .nsreCompatibleDot:
self = .nsreCompatibleDot

// Whitespace options are only relevant during parsing, not compilation.
case .extended, .extraExtended:
Expand Down
12 changes: 12 additions & 0 deletions Sources/_StringProcessing/Regex/Options.swift
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,18 @@ extension Regex {
return wrapInOption(.unicodeScalarSemantics, addingIf: true)
}
}

/// A copy of this regular expression that matches using an
/// NSRegularExpression compatibility mode.
///
/// The returned regex is wrapped in two options: the NSRE-compatible `dot`
/// option and Unicode scalar semantics. The compatible-dot option only
/// changes behavior when the separate dot-matches-newlines option is also
/// enabled; in that case a `dot` can match a whole newline sequence.
@_spi(Foundation)
public var _nsreCompatibility: Regex<RegexOutput> {
  // Apply the dot option first, then layer scalar semantics on top of it.
  let withCompatibleDot = wrapInOption(.nsreCompatibleDot, addingIf: true)
  return withCompatibleDot.wrapInOption(.unicodeScalarSemantics, addingIf: true)
}
}

/// A semantic level to use during regex matching.
Expand Down
38 changes: 37 additions & 1 deletion Tests/RegexTests/MatchTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import XCTest
@testable import _RegexParser
@testable @_spi(RegexBenchmark) import _StringProcessing
@testable @_spi(RegexBenchmark) @_spi(Foundation) import _StringProcessing
import TestSupport

struct MatchError: Error {
Expand Down Expand Up @@ -2726,4 +2726,40 @@ extension RegexTests {
XCTAssertNotNil(str.wholeMatch(of: possessiveRegex))
}
}

/// Exercises the `_nsreCompatibility` SPI: scalar-level matching of `\r` /
/// `\n`, `dot` matching a newline sequence when dot-matches-newlines is on,
/// and the absence of that special case for other "match anything" patterns.
func testNSRECompatibility() throws {
  // NSRE-compatibility includes scalar matching, so `[\r\n]` should match
  // either `\r` or `\n`.
  let text = #"""
    y=sin(x)+sin(2x)+sin(3x);\#rText "This is a function of x.";\r
    """#
  let lineTerminationRegex = try Regex(#";[\r\n]"#)
    ._nsreCompatibility

  let afterLine = try XCTUnwrap(text.firstRange(of: "Text"))
  // Unwrap the match so a missing match is reported as such, instead of
  // surfacing as an opaque `nil == index` comparison failure.
  let match = try XCTUnwrap(lineTerminationRegex.firstMatch(in: text))
  XCTAssertEqual(match.range.upperBound, afterLine.lowerBound)

  // NSRE-compatibility treats "dot" as special, in that it can match a
  // newline sequence as well as a single Unicode scalar.
  let aDotBRegex = try Regex(#"a.b"#)
    ._nsreCompatibility
    .dotMatchesNewlines()
  for input in ["a\rb", "a\nb", "a\r\nb"] {
    XCTAssertNotNil(
      try aDotBRegex.wholeMatch(in: input),
      "Expected `a.b` to match \(input.debugDescription)")
  }

  // NSRE-compatibility doesn't give special treatment to newline sequences
  // when matching other "match everything" regex patterns, like `[[^z]z]`,
  // so this pattern doesn't match "a\r\nb".
  let aCCBRegex = try Regex(#"a[[^z]z]b"#)
    ._nsreCompatibility
  for input in ["a\rb", "a\nb", "a\r\nb"] {
    // Only the three-scalar inputs have a single scalar between `a` and `b`.
    if input.unicodeScalars.count == 3 {
      XCTAssertNotNil(
        try aCCBRegex.wholeMatch(in: input),
        "Expected `a[[^z]z]b` to match \(input.debugDescription)")
    } else {
      XCTAssertNil(
        try aCCBRegex.wholeMatch(in: input),
        "Expected `a[[^z]z]b` not to match \(input.debugDescription)")
    }
  }
}
}