From 0454f1353cd464df1ee11e943deb8833cb42dc77 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Tue, 12 Apr 2022 17:43:51 +0100
Subject: [PATCH] Allow custom character classes to begin with `:`

ICU and Oniguruma allow custom character classes to
begin with `:`, and only lex a POSIX character
property if they detect a closing `:]`. However
their behavior for this differs:

- ICU will consider *any* `:]` in the regex as a
closing delimiter, even e.g `[[:a]][:]`.

- Oniguruma will stop if it hits a `]`, so
`[[:a]][:]` is treated as a custom character class.
However it only scans ahead 20 chars max, and doesn't
stop for e.g a nested character class opening `[`.

Our detection behavior for this is as follows:

- When `[:` is encountered inside a custom character
class, scan ahead to the closing `:]`.
- While scanning, bail if we see any characters
that are obviously invalid property names. Currently
this includes `[`, `]`, `}`, as well as a second
occurrence of `=`.
- Otherwise, if we end on `:]`, consider that a
POSIX character property.

We could include more metacharacters to bail on,
e.g `{`, `(`, `)`, but for now I'm tending on the
side of lexing an invalid POSIX character property.
We can always relax this in the future (as we'd be
turning invalid code into valid code). Users can
always escape the initial `:` in `[:` if they want
a custom character class. In fact, we may want to
suggest this via a warning, as this behavior can
be pretty subtle.
---
 .../Regex/Parse/LexicalAnalysis.swift         | 121 +++++++++++++-----
 Sources/_RegexParser/Regex/Parse/Parse.swift  |   4 +-
 Tests/RegexTests/ParseTests.swift             |  39 +++++-
 3 files changed, 129 insertions(+), 35 deletions(-)

diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
index c48d53de9..bf53d0ab1 100644
--- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
+++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -1064,11 +1064,16 @@ extension Source {
   }
 
   mutating func lexCustomCCStart(
+    context: ParsingContext
   ) throws -> Located<CustomCC.Start>? {
     recordLoc { src in
-      // POSIX named sets are atoms.
-      guard !src.starts(with: "[:") else { return nil }
-
+      // Make sure we don't have a POSIX character property. This may require
+      // walking to its ending to make sure we have a closing ':]', as otherwise
+      // we have a custom character class.
+      // TODO: This behavior seems subtle, could we warn?
+      guard !src.canLexPOSIXCharacterProperty(context: context) else {
+        return nil
+      }
       if src.tryEat("[") {
         return src.tryEat("^") ? .inverted : .normal
       }
@@ -1099,12 +1104,38 @@ extension Source {
   }
 
   private mutating func lexPOSIXCharacterProperty(
+    context: ParsingContext
   ) throws -> Located<AST.Atom.CharacterProperty>? {
-    try recordLoc { src in
-      guard src.tryEat(sequence: "[:") else { return nil }
-      let inverted = src.tryEat("^")
-      let prop = try src.lexCharacterPropertyContents(end: ":]").value
-      return .init(prop, isInverted: inverted, isPOSIX: true)
+    // Only allowed in a custom character class.
+    guard context.isInCustomCharacterClass else { return nil }
+    return try recordLoc { src in
+      try src.tryEating { src in
+        guard src.tryEat(sequence: "[:") else { return nil }
+        let inverted = src.tryEat("^")
+
+        // Note we lex the contents and ending *before* classifying, because we
+        // want to bail with nil if we don't have the right ending. This allows
+        // the lexing of a custom character class if we don't have a ':]'
+        // ending.
+        let (key, value) = src.lexCharacterPropertyKeyValue()
+        guard src.tryEat(sequence: ":]") else { return nil }
+
+        let prop = try Source.classifyCharacterPropertyContents(key: key,
+                                                                value: value)
+        return .init(prop, isInverted: inverted, isPOSIX: true)
+      }
+    }
+  }
+
+  private func canLexPOSIXCharacterProperty(context: ParsingContext) -> Bool {
+    do {
+      var src = self
+      return try src.lexPOSIXCharacterProperty(context: context) != nil
+    } catch {
+      // We want to tend on the side of lexing a POSIX character property, so
+      // even if it is invalid in some way (e.g invalid property names), still
+      // try and lex it.
+      return true
     }
   }
 
@@ -1129,26 +1160,52 @@ extension Source {
     }
   }
 
-  private mutating func lexCharacterPropertyContents(
-    end: String
-  ) throws -> Located<AST.Atom.CharacterProperty.Kind> {
-    try recordLoc { src in
-      // We should either have:
-      // - 'x=y' where 'x' is a property key, and 'y' is a value.
-      // - 'y' where 'y' is a value (or a bool key with an inferred value
-      //   of true), and its key is inferred.
-      // TODO: We could have better recovery here if we only ate the characters
-      // that property keys and values can use.
-      let lhs = src.lexUntil {
-        $0.isEmpty || $0.peek() == "=" || $0.starts(with: end)
-      }.value
-      if src.tryEat("=") {
-        let rhs = try src.lexUntil(eating: end).value
-        return try Source.classifyCharacterProperty(key: lhs, value: rhs)
+  private mutating func lexCharacterPropertyKeyValue(
+  ) -> (key: String?, value: String) {
+    func atPossibleEnding(_ src: inout Source) -> Bool {
+      guard let next = src.peek() else { return true }
+      switch next {
+      case "=":
+        // End of a key.
+        return true
+      case ":", "[", "]":
+        // POSIX character property endings to cover ':]', ']', and '[' as the
+        // start of a nested character class.
+        return true
+      case "}":
+        // Ending of '\p{'. We cover this for POSIX too as it's not a valid
+        // character property name anyway, and it's nice not to have diverging
+        // logic for these cases.
+        return true
+      default:
+        // We may want to handle other metacharacters here, e.g '{', '(', ')',
+        // as they're not valid character property names. However for now
+        // let's tend on the side of forming an unknown property name in case
+        // these characters are ever used in future character property names
+        // (though it's very unlikely). Users can always escape e.g the ':'
+        // in '[:' if they definitely want a custom character class.
+        return false
       }
-      try src.expect(sequence: end)
-      return try Source.classifyCharacterPropertyValueOnly(lhs)
     }
+    // We should either have:
+    // - 'x=y' where 'x' is a property key, and 'y' is a value.
+    // - 'y' where 'y' is a value (or a bool key with an inferred value of true)
+    //   and its key is inferred.
+    let lhs = lexUntil(atPossibleEnding).value
+    if tryEat("=") {
+      let rhs = lexUntil(atPossibleEnding).value
+      return (lhs, rhs)
+    }
+    return (nil, lhs)
+  }
+
+  private static func classifyCharacterPropertyContents(
+    key: String?, value: String
+  ) throws -> AST.Atom.CharacterProperty.Kind {
+    if let key = key {
+      return try classifyCharacterProperty(key: key, value: value)
+    }
+    return try classifyCharacterPropertyValueOnly(value)
   }
 
   /// Try to consume a character property.
@@ -1164,7 +1221,10 @@ extension Source {
       let isInverted = src.peek() == "P"
       src.advance(2)
 
-      let prop = try src.lexCharacterPropertyContents(end: "}").value
+      let (key, value) = src.lexCharacterPropertyKeyValue()
+      let prop = try Source.classifyCharacterPropertyContents(key: key,
+                                                              value: value)
+      try src.expect("}")
       return .init(prop, isInverted: isInverted, isPOSIX: false)
     }
   }
@@ -1758,11 +1818,8 @@ extension Source {
       if !customCC && (src.peek() == ")" || src.peek() == "|") { return nil }
       // TODO: Store customCC in the atom, if that's useful
 
-      // POSIX character property. This is only allowed in a custom character
-      // class.
-      // TODO: Can we try and recover and diagnose these outside character
-      // classes?
-      if customCC, let prop = try src.lexPOSIXCharacterProperty()?.value {
+      // POSIX character property.
+      if let prop = try src.lexPOSIXCharacterProperty(context: context)?.value {
         return .property(prop)
       }
 
diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift
index c3aa3500b..d70a1f14f 100644
--- a/Sources/_RegexParser/Regex/Parse/Parse.swift
+++ b/Sources/_RegexParser/Regex/Parse/Parse.swift
@@ -403,7 +403,7 @@ extension Parser {
     }
 
     // Check if we have the start of a custom character class '['.
-    if let cccStart = try source.lexCustomCCStart() {
+    if let cccStart = try source.lexCustomCCStart(context: context) {
       return .customCharacterClass(
         try parseCustomCharacterClass(cccStart))
     }
@@ -487,7 +487,7 @@ extension Parser {
     while source.peek() != "]" && source.peekCCBinOp() == nil {
 
       // Nested custom character class.
-      if let cccStart = try source.lexCustomCCStart() {
+      if let cccStart = try source.lexCustomCCStart(context: context) {
         members.append(.custom(try parseCustomCharacterClass(cccStart)))
         continue
       }
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index 5e3defb11..aa0ebbf98 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -474,6 +474,26 @@ extension RegexTests {
     parseTest(
       "[a^]", charClass("a", "^"))
 
+    // These are custom character classes, not invalid POSIX character classes.
+    // TODO: This behavior is subtle, we ought to warn.
+    parseTest("[[:space]]", charClass(charClass(":", "s", "p", "a", "c", "e")))
+    parseTest("[:space:]", charClass(":", "s", "p", "a", "c", "e", ":"))
+    parseTest("[:a]", charClass(":", "a"))
+    parseTest("[a:]", charClass("a", ":"))
+    parseTest("[:]", charClass(":"))
+    parseTest("[::]", charClass(":", ":"))
+    parseTest("[:=:]", charClass(":", "=", ":"))
+    parseTest("[[:]]", charClass(charClass(":")))
+    parseTest("[[:a=b=c:]]", charClass(charClass(":", "a", "=", "b", "=", "c", ":")))
+
+    parseTest(#"[[:a[b]:]]"#, charClass(charClass(":", "a", charClass("b"), ":")))
+    parseTest(#"[[:a]][:]"#, concat(charClass(charClass(":", "a")), charClass(":")))
+    parseTest(#"[[:a]]"#, charClass(charClass(":", "a")))
+    parseTest(#"[[:}]]"#, charClass(charClass(":", "}")))
+    parseTest(#"[[:{]]"#, charClass(charClass(":", "{")))
+    parseTest(#"[[:{:]]"#, charClass(posixProp_m(.other(key: nil, value: "{"))))
+    parseTest(#"[[:}:]]"#, charClass(charClass(":", "}", ":")))
+
     parseTest(
       #"\D\S\W"#,
       concat(
@@ -1096,9 +1116,13 @@ extension RegexTests {
       #"\p{C}+"#,
       oneOrMore(of: prop(.generalCategory(.other))))
 
+    // TODO: Start erroring on these?
     parseTest(#"\p{Lx}"#, prop(.other(key: nil, value: "Lx")))
     parseTest(#"\p{gcL}"#, prop(.other(key: nil, value: "gcL")))
     parseTest(#"\p{x=y}"#, prop(.other(key: "x", value: "y")))
+    parseTest(#"\p{aaa(b)}"#, prop(.other(key: nil, value: "aaa(b)")))
+    parseTest("[[:a():]]", charClass(posixProp_m(.other(key: nil, value: "a()"))))
+    parseTest(#"\p{aaa\p{b}}"#, concat(prop(.other(key: nil, value: #"aaa\p{b"#)), "}"))
 
     // UAX44-LM3 means all of the below are equivalent.
     let lowercaseLetter = prop(.generalCategory(.lowercaseLetter))
@@ -2183,7 +2207,11 @@ extension RegexTests {
     diagnosticTest(#"\N{A"#, .expected("}"))
     diagnosticTest(#"\N{U+A"#, .expected("}"))
     diagnosticTest(#"\p{a"#, .expected("}"))
-    diagnosticTest(#"\p{a="#, .expected("}"))
+    diagnosticTest(#"\p{a="#, .emptyProperty)
+    diagnosticTest(#"\p{a=}"#, .emptyProperty)
+    diagnosticTest(#"\p{a=b"#, .expected("}"))
+    diagnosticTest(#"\p{aaa[b]}"#, .expected("}"))
+    diagnosticTest(#"\p{a=b=c}"#, .expected("}"))
     diagnosticTest(#"(?#"#, .expected(")"))
     diagnosticTest(#"(?x"#, .expected(")"))
 
@@ -2218,6 +2246,15 @@ extension RegexTests {
     // the closing bracket.
     diagnosticTest("[]", .expected("]"))
 
+    diagnosticTest("[:a", .expected("]"))
+    diagnosticTest("[:a:", .expected("]"))
+    diagnosticTest("[[:a", .expected("]"))
+    diagnosticTest("[[:a:", .expected("]"))
+    diagnosticTest("[[:a[:]", .expected("]"))
+
+    diagnosticTest("[[::]]", .emptyProperty)
+    diagnosticTest("[[:=:]]", .emptyProperty)
+
     // MARK: Bad escapes
 
     diagnosticTest("\\", .expectedEscape)