From 68d5acb0a305ff15a4bd25c89591531c1cd49a4a Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Fri, 6 Oct 2023 21:58:38 -0400
Subject: [PATCH] syntax/ast: add support for additional word boundary
 assertions

This adds AST support for the following new assertions:
\b{start}, \b{end}, \b{start-half}, \b{end-half}, \< and \>. The last
two, \< and \>, are aliases for \b{start} and \b{end}.

The parsing for this is a little suspect since there's a little
ambiguity between, e.g., \b{5} and \b{start}, but we handle it by
allowing the parser to look for one of the new special assertions, and
then back-up if it fails to find one so that it can try to parse a
counted repetition.

Ref #469
---
 regex-syntax/src/ast/mod.rs       |  47 +++++++
 regex-syntax/src/ast/parse.rs     | 226 ++++++++++++++++++++++++++++--
 regex-syntax/src/ast/print.rs     |   6 +
 regex-syntax/src/hir/translate.rs |  14 ++
 regex-syntax/src/lib.rs           |   3 +
 5 files changed, 281 insertions(+), 15 deletions(-)

diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs
index 9e0f92606..6a77ee134 100644
--- a/regex-syntax/src/ast/mod.rs
+++ b/regex-syntax/src/ast/mod.rs
@@ -162,6 +162,18 @@ pub enum ErrorKind {
     /// `(?i)*`. It is, however, possible to create a repetition operating on
     /// an empty sub-expression. For example, `()*` is still considered valid.
     RepetitionMissing,
+    /// The special word boundary syntax, `\b{something}`, was used, but
+    /// either EOF without `}` was seen, or an invalid character in the
+    /// braces was seen.
+    SpecialWordBoundaryUnclosed,
+    /// The special word boundary syntax, `\b{something}`, was used, but
+    /// `something` was not recognized as a valid word boundary kind.
+    SpecialWordBoundaryUnrecognized,
+    /// The syntax `\b{` was observed, but afterwards the end of the pattern
+    /// was observed without being able to tell whether it was meant to be a
+    /// bounded repetition on the `\b` or the beginning of a special word
+    /// boundary assertion.
+    SpecialWordOrRepetitionUnexpectedEof,
     /// The Unicode class is not valid. This typically occurs when a `\p` is
     /// followed by something other than a `{`.
     UnicodeClassInvalid,
@@ -260,6 +272,29 @@ impl core::fmt::Display for ErrorKind {
             RepetitionMissing => {
                 write!(f, "repetition operator missing expression")
             }
+            SpecialWordBoundaryUnclosed => {
+                write!(
+                    f,
+                    "special word boundary assertion is either \
+                     unclosed or contains an invalid character",
+                )
+            }
+            SpecialWordBoundaryUnrecognized => {
+                write!(
+                    f,
+                    "unrecognized special word boundary assertion, \
+                     valid choices are: start, end, start-half \
+                     or end-half",
+                )
+            }
+            SpecialWordOrRepetitionUnexpectedEof => {
+                write!(
+                    f,
+                    "found either the beginning of a special word \
+                     boundary or a bounded repetition on a \\b with \
+                     an opening brace, but no closing brace",
+                )
+            }
             UnicodeClassInvalid => {
                 write!(f, "invalid Unicode character class")
             }
@@ -1293,6 +1328,18 @@ pub enum AssertionKind {
     WordBoundary,
     /// `\B`
     NotWordBoundary,
+    /// `\b{start}`
+    WordBoundaryStart,
+    /// `\b{end}`
+    WordBoundaryEnd,
+    /// `\<` (alias for `\b{start}`)
+    WordBoundaryStartAngle,
+    /// `\>` (alias for `\b{end}`)
+    WordBoundaryEndAngle,
+    /// `\b{start-half}`
+    WordBoundaryStartHalf,
+    /// `\b{end-half}`
+    WordBoundaryEndHalf,
 }
 
 /// A repetition operation applied to a regular expression.
diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs
index f7bae7759..593b14fbc 100644
--- a/regex-syntax/src/ast/parse.rs
+++ b/regex-syntax/src/ast/parse.rs
@@ -1528,18 +1528,115 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
                 span,
                 kind: ast::AssertionKind::EndText,
             })),
-            'b' => Ok(Primitive::Assertion(ast::Assertion {
-                span,
-                kind: ast::AssertionKind::WordBoundary,
-            })),
+            'b' => {
+                let mut wb = ast::Assertion {
+                    span,
+                    kind: ast::AssertionKind::WordBoundary,
+                };
+                // After a \b, we "try" to parse things like \b{start} for
+                // special word boundary assertions.
+                if !self.is_eof() && self.char() == '{' {
+                    if let Some(kind) =
+                        self.maybe_parse_special_word_boundary(start)?
+                    {
+                        wb.kind = kind;
+                        wb.span.end = self.pos();
+                    }
+                }
+                Ok(Primitive::Assertion(wb))
+            }
             'B' => Ok(Primitive::Assertion(ast::Assertion {
                 span,
                 kind: ast::AssertionKind::NotWordBoundary,
             })),
+            '<' => Ok(Primitive::Assertion(ast::Assertion {
+                span,
+                kind: ast::AssertionKind::WordBoundaryStartAngle,
+            })),
+            '>' => Ok(Primitive::Assertion(ast::Assertion {
+                span,
+                kind: ast::AssertionKind::WordBoundaryEndAngle,
+            })),
             _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
         }
     }
 
+    /// Attempt to parse a specialty word boundary. That is, `\b{start}`,
+    /// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
+    ///
+    /// This is similar to `maybe_parse_ascii_class` in that, in most cases,
+    /// if it fails it will just return `None` with no error. This is done
+    /// because `\b{5}` is a valid expression and we want to let that be parsed
+    /// by the existing counted repetition parsing code. (I thought about just
+    /// invoking the counted repetition code from here, but it seemed a little
+    /// ham-fisted.)
+    ///
+    /// Unlike `maybe_parse_ascii_class` though, this can return an error.
+    /// Namely, if we definitely know it isn't a counted repetition, then we
+    /// return an error specific to the specialty word boundaries.
+    ///
+    /// This assumes the parser is positioned at a `{` immediately following
+    /// a `\b`. When `None` is returned, the parser is returned to the position
+    /// at which it started: pointing at a `{`.
+    ///
+    /// The position given should correspond to the start of the `\b`.
+    fn maybe_parse_special_word_boundary(
+        &self,
+        wb_start: Position,
+    ) -> Result<Option<ast::AssertionKind>> {
+        assert_eq!(self.char(), '{');
+
+        let is_valid_char = |c| match c {
+            'A'..='Z' | 'a'..='z' | '-' => true,
+            _ => false,
+        };
+        let start = self.pos();
+        if !self.bump_and_bump_space() {
+            return Err(self.error(
+                Span::new(wb_start, self.pos()),
+                ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
+            ));
+        }
+        let start_contents = self.pos();
+        // This is one of the critical bits: if the first non-whitespace
+        // character isn't in [-A-Za-z] (i.e., this can't be a special word
+        // boundary), then we bail and let the counted repetition parser deal
+        // with this.
+        if !is_valid_char(self.char()) {
+            self.parser().pos.set(start);
+            return Ok(None);
+        }
+
+        // Now collect up our chars until we see a '}'.
+        let mut scratch = self.parser().scratch.borrow_mut();
+        scratch.clear();
+        while !self.is_eof() && is_valid_char(self.char()) {
+            scratch.push(self.char());
+            self.bump_and_bump_space();
+        }
+        if self.is_eof() || self.char() != '}' {
+            return Err(self.error(
+                Span::new(start, self.pos()),
+                ast::ErrorKind::SpecialWordBoundaryUnclosed,
+            ));
+        }
+        let end = self.pos();
+        self.bump();
+        let kind = match scratch.as_str() {
+            "start" => ast::AssertionKind::WordBoundaryStart,
+            "end" => ast::AssertionKind::WordBoundaryEnd,
+            "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
+            "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
+            _ => {
+                return Err(self.error(
+                    Span::new(start_contents, end),
+                    ast::ErrorKind::SpecialWordBoundaryUnrecognized,
+                ))
+            }
+        };
+        Ok(Some(kind))
+    }
+
     /// Parse an octal representation of a Unicode codepoint up to 3 digits
     /// long. This expects the parser to be positioned at the first octal
     /// digit and advances the parser to the first character immediately
@@ -1967,9 +2064,9 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
         // because parsing cannot fail with any interesting error. For example,
         // in order to use an ASCII character class, it must be enclosed in
         // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
-        // of it as "ASCII character characters have the syntax `[:NAME:]`
-        // which can only appear within character brackets." This means that
-        // things like `[[:lower:]A]` are legal constructs.
+        // of it as "ASCII character classes have the syntax `[:NAME:]` which
+        // can only appear within character brackets." This means that things
+        // like `[[:lower:]A]` are legal constructs.
         //
         // However, if one types an incorrect ASCII character class, e.g.,
         // `[[:loower:]]`, then we treat that as a normal nested character
@@ -3295,6 +3392,23 @@ bar
                 ast: Box::new(lit('a', 0)),
             }))
         );
+        assert_eq!(
+            parser(r"\b{5,9}").parse(),
+            Ok(Ast::repetition(ast::Repetition {
+                span: span(0..7),
+                op: ast::RepetitionOp {
+                    span: span(2..7),
+                    kind: ast::RepetitionKind::Range(
+                        ast::RepetitionRange::Bounded(5, 9)
+                    ),
+                },
+                greedy: true,
+                ast: Box::new(Ast::assertion(ast::Assertion {
+                    span: span(0..2),
+                    kind: ast::AssertionKind::WordBoundary,
+                })),
+            }))
+        );
 
         assert_eq!(
             parser(r"(?i){0}").parse().unwrap_err(),
@@ -4381,6 +4495,48 @@ bar
                 kind: ast::AssertionKind::WordBoundary,
             }))
         );
+        assert_eq!(
+            parser(r"\b{start}").parse_primitive(),
+            Ok(Primitive::Assertion(ast::Assertion {
+                span: span(0..9),
+                kind: ast::AssertionKind::WordBoundaryStart,
+            }))
+        );
+        assert_eq!(
+            parser(r"\b{end}").parse_primitive(),
+            Ok(Primitive::Assertion(ast::Assertion {
+                span: span(0..7),
+                kind: ast::AssertionKind::WordBoundaryEnd,
+            }))
+        );
+        assert_eq!(
+            parser(r"\b{start-half}").parse_primitive(),
+            Ok(Primitive::Assertion(ast::Assertion {
+                span: span(0..14),
+                kind: ast::AssertionKind::WordBoundaryStartHalf,
+            }))
+        );
+        assert_eq!(
+            parser(r"\b{end-half}").parse_primitive(),
+            Ok(Primitive::Assertion(ast::Assertion {
+                span: span(0..12),
+                kind: ast::AssertionKind::WordBoundaryEndHalf,
+            }))
+        );
+        assert_eq!(
+            parser(r"\<").parse_primitive(),
+            Ok(Primitive::Assertion(ast::Assertion {
+                span: span(0..2),
+                kind: ast::AssertionKind::WordBoundaryStartAngle,
+            }))
+        );
+        assert_eq!(
+            parser(r"\>").parse_primitive(),
+            Ok(Primitive::Assertion(ast::Assertion {
+                span: span(0..2),
+                kind: ast::AssertionKind::WordBoundaryEndAngle,
+            }))
+        );
         assert_eq!(
             parser(r"\B").parse_primitive(),
             Ok(Primitive::Assertion(ast::Assertion {
@@ -4418,20 +4574,60 @@ bar
                 kind: ast::ErrorKind::EscapeUnrecognized,
             }
         );
-        // But also, < and > are banned, so that we may evolve them into
-        // start/end word boundary assertions. (Not sure if we will...)
+
+        // Starting a special word boundary without any non-whitespace chars
+        // after the brace makes it ambiguous whether the user meant to write
+        // a counted repetition (probably not?) or an actual special word
+        // boundary assertion.
         assert_eq!(
-            parser(r"\<").parse_escape().unwrap_err(),
+            parser(r"\b{").parse_escape().unwrap_err(),
             TestError {
-                span: span(0..2),
-                kind: ast::ErrorKind::EscapeUnrecognized,
+                span: span(0..3),
+                kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
             }
         );
         assert_eq!(
-            parser(r"\>").parse_escape().unwrap_err(),
+            parser_ignore_whitespace(r"\b{ ").parse_escape().unwrap_err(),
             TestError {
-                span: span(0..2),
-                kind: ast::ErrorKind::EscapeUnrecognized,
+                span: span(0..4),
+                kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
+            }
+        );
+        // When 'x' is not enabled, the space is seen as a non-[-A-Za-z] char,
+        // and thus causes the parser to treat it as a counted repetition.
+        assert_eq!(
+            parser(r"\b{ ").parse().unwrap_err(),
+            TestError {
+                span: span(4..4),
+                kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
+            }
+        );
+        // In this case, we got some valid chars that makes it look like the
+        // user is writing one of the special word boundary assertions, but
+        // we forget to close the brace.
+        assert_eq!(
+            parser(r"\b{foo").parse_escape().unwrap_err(),
+            TestError {
+                span: span(2..6),
+                kind: ast::ErrorKind::SpecialWordBoundaryUnclosed,
+            }
+        );
+        // We get the same error as above, except it is provoked by seeing a
+        // char that we know is invalid before seeing a closing brace.
+        assert_eq!(
+            parser(r"\b{foo!}").parse_escape().unwrap_err(),
+            TestError {
+                span: span(2..6),
+                kind: ast::ErrorKind::SpecialWordBoundaryUnclosed,
+            }
+        );
+        // And this one occurs when, syntactically, everything looks okay, but
+        // we don't use a valid spelling of a word boundary assertion.
+        assert_eq!(
+            parser(r"\b{foo}").parse_escape().unwrap_err(),
+            TestError {
+                span: span(3..6),
+                kind: ast::ErrorKind::SpecialWordBoundaryUnrecognized,
             }
         );
 
diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs
index 7dedf7f48..1ceb3c7fa 100644
--- a/regex-syntax/src/ast/print.rs
+++ b/regex-syntax/src/ast/print.rs
@@ -261,6 +261,12 @@ impl<W: fmt::Write> Writer<W> {
             EndText => self.wtr.write_str(r"\z"),
             WordBoundary => self.wtr.write_str(r"\b"),
             NotWordBoundary => self.wtr.write_str(r"\B"),
+            WordBoundaryStart => self.wtr.write_str(r"\b{start}"),
+            WordBoundaryEnd => self.wtr.write_str(r"\b{end}"),
+            WordBoundaryStartAngle => self.wtr.write_str(r"\<"),
+            WordBoundaryEndAngle => self.wtr.write_str(r"\>"),
+            WordBoundaryStartHalf => self.wtr.write_str(r"\b{start-half}"),
+            WordBoundaryEndHalf => self.wtr.write_str(r"\b{end-half}"),
         }
     }
 
diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
index 56d261aa1..4ae279f92 100644
--- a/regex-syntax/src/hir/translate.rs
+++ b/regex-syntax/src/hir/translate.rs
@@ -962,6 +962,20 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
             } else {
                 hir::Look::WordAsciiNegate
             }),
+            ast::AssertionKind::WordBoundaryStart
+            | ast::AssertionKind::WordBoundaryStartAngle => {
+                Hir::look(if unicode { todo!() } else { todo!() })
+            }
+            ast::AssertionKind::WordBoundaryEnd
+            | ast::AssertionKind::WordBoundaryEndAngle => {
+                Hir::look(if unicode { todo!() } else { todo!() })
+            }
+            ast::AssertionKind::WordBoundaryStartHalf => {
+                Hir::look(if unicode { todo!() } else { todo!() })
+            }
+            ast::AssertionKind::WordBoundaryEndHalf => {
+                Hir::look(if unicode { todo!() } else { todo!() })
+            }
         })
     }
 
diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs
index a552099c6..38c8d88d4 100644
--- a/regex-syntax/src/lib.rs
+++ b/regex-syntax/src/lib.rs
@@ -334,6 +334,9 @@ pub fn is_escapeable_character(c: char) -> bool {
         // escapeable, \< and \> will result in a parse error. Thus, we can
         // turn them into something else in the future without it being a
         // backwards incompatible change.
+        //
+        // OK, now we support \< and \>, and we need to retain them as *not*
+        // escapeable here since the escape sequence is significant.
         '<' | '>' => false,
         _ => true,
     }