syntax: permit most no-op escape sequences

BurntSushi · BurntSushi · commit fe8d6673d3ea · 2023-04-17T15:42:25.000-04:00
This resolves a long-standing (but somewhat minor) complaint that folks have with the regex crate: it does not permit escaping punctuation characters in cases where those characters do not need to be escaped. So things like \/, \" and \! would result in parse errors. Most other regex engines permit these, even in cases where they aren't needed. I had been against doing this for future evolution purposes, but it's incredibly unlikely that we're ever going to add a new meta character to the syntax. I literally cannot think of any conceivable future in which that might happen. However, we do continue to ban escapes for [0-9A-Za-z<>], because it is conceivable that we might add new escape sequences for those characters. (And 0-9 are already banned by virtue of them looking too much like backreferences, which aren't supported.) For example, we could add \Q...\E literal syntax. Or \< and \> as start and end word boundaries, as found in POSIX regex engines. Fixes #501
diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs
@@ -588,9 +588,12 @@ impl Literal {
 pub enum LiteralKind {
     /// The literal is written verbatim, e.g., `a` or `☃`.
     Verbatim,
-    /// The literal is written as an escape because it is punctuation, e.g.,
-    /// `\*` or `\[`.
-    Punctuation,
+    /// The literal is written as an escape because it is otherwise a special
+    /// regex meta character, e.g., `\*` or `\[`.
+    Meta,
+    /// The literal is written as an escape despite the fact that the escape is
+    /// unnecessary, e.g., `\%` or `\/`.
+    Superfluous,
     /// The literal is written as an octal escape, e.g., `\141`.
     Octal,
     /// The literal is written as a hex code with a fixed number of digits
diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs
@@ -18,7 +18,7 @@ use alloc::{
 use crate::{
     ast::{self, Ast, Position, Span},
     either::Either,
-    is_meta_character,
+    is_escapeable_character, is_meta_character,
 };
 
 type Result<T> = core::result::Result<T, ast::Error>;
@@ -1495,7 +1495,14 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
         if is_meta_character(c) {
             return Ok(Primitive::Literal(ast::Literal {
                 span,
-                kind: ast::LiteralKind::Punctuation,
+                kind: ast::LiteralKind::Meta,
+                c,
+            }));
+        }
+        if is_escapeable_character(c) {
+            return Ok(Primitive::Literal(ast::Literal {
+                span,
+                kind: ast::LiteralKind::Superfluous,
                 c,
             }));
         }
@@ -1513,9 +1520,6 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
             'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'),
             'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'),
             'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'),
-            ' ' if self.ignore_whitespace() => {
-                special(ast::SpecialLiteralKind::Space, ' ')
-            }
             'A' => Ok(Primitive::Assertion(ast::Assertion {
                 span,
                 kind: ast::AssertionKind::StartText,
@@ -2420,13 +2424,9 @@ mod tests {
         lit_with(c, span(start..start + c.len_utf8()))
     }
 
-    /// Create a punctuation literal starting at the given position.
-    fn punct_lit(c: char, span: Span) -> Ast {
-        Ast::Literal(ast::Literal {
-            span,
-            kind: ast::LiteralKind::Punctuation,
-            c,
-        })
+    /// Create a meta literal starting at the given position.
+    fn meta_lit(c: char, span: Span) -> Ast {
+        Ast::Literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c })
     }
 
     /// Create a verbatim literal with the given span.
@@ -2710,24 +2710,24 @@ bar
             Ok(concat(
                 0..36,
                 vec![
-                    punct_lit('\\', span(0..2)),
-                    punct_lit('.', span(2..4)),
-                    punct_lit('+', span(4..6)),
-                    punct_lit('*', span(6..8)),
-                    punct_lit('?', span(8..10)),
-                    punct_lit('(', span(10..12)),
-                    punct_lit(')', span(12..14)),
-                    punct_lit('|', span(14..16)),
-                    punct_lit('[', span(16..18)),
-                    punct_lit(']', span(18..20)),
-                    punct_lit('{', span(20..22)),
-                    punct_lit('}', span(22..24)),
-                    punct_lit('^', span(24..26)),
-                    punct_lit('$', span(26..28)),
-                    punct_lit('#', span(28..30)),
-                    punct_lit('&', span(30..32)),
-                    punct_lit('-', span(32..34)),
-                    punct_lit('~', span(34..36)),
+                    meta_lit('\\', span(0..2)),
+                    meta_lit('.', span(2..4)),
+                    meta_lit('+', span(4..6)),
+                    meta_lit('*', span(6..8)),
+                    meta_lit('?', span(8..10)),
+                    meta_lit('(', span(10..12)),
+                    meta_lit(')', span(12..14)),
+                    meta_lit('|', span(14..16)),
+                    meta_lit('[', span(16..18)),
+                    meta_lit(']', span(18..20)),
+                    meta_lit('{', span(20..22)),
+                    meta_lit('}', span(22..24)),
+                    meta_lit('^', span(24..26)),
+                    meta_lit('$', span(26..28)),
+                    meta_lit('#', span(28..30)),
+                    meta_lit('&', span(30..32)),
+                    meta_lit('-', span(32..34)),
+                    meta_lit('~', span(34..36)),
                 ]
             ))
         );
@@ -2879,23 +2879,12 @@ bar
                     flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
                     Ast::Literal(ast::Literal {
                         span: span_range(pat, 4..6),
-                        kind: ast::LiteralKind::Special(
-                            ast::SpecialLiteralKind::Space
-                        ),
+                        kind: ast::LiteralKind::Superfluous,
                         c: ' ',
                     }),
                 ]
             ))
         );
-        // ... but only when `x` mode is enabled.
-        let pat = r"\ ";
-        assert_eq!(
-            parser(pat).parse().unwrap_err(),
-            TestError {
-                span: span_range(pat, 0..2),
-                kind: ast::ErrorKind::EscapeUnrecognized,
-            }
-        );
     }
 
     #[test]
@@ -4246,7 +4235,7 @@ bar
             parser(r"\|").parse_primitive(),
             Ok(Primitive::Literal(ast::Literal {
                 span: span(0..2),
-                kind: ast::LiteralKind::Punctuation,
+                kind: ast::LiteralKind::Meta,
                 c: '|',
             }))
         );
@@ -4297,11 +4286,26 @@ bar
             }))
         );
 
+        // We also support superfluous escapes in most cases now too.
+        for c in ['!', '@', '%', '"', '\'', '/', ' '] {
+            let pat = format!(r"\{}", c);
+            assert_eq!(
+                parser(&pat).parse_primitive(),
+                Ok(Primitive::Literal(ast::Literal {
+                    span: span(0..2),
+                    kind: ast::LiteralKind::Superfluous,
+                    c,
+                }))
+            );
+        }
+
+        // Some superfluous escapes, namely [0-9A-Za-z], are still banned. This
+        // gives flexibility for future evolution.
         assert_eq!(
-            parser(r"\").parse_escape().unwrap_err(),
+            parser(r"\e").parse_escape().unwrap_err(),
             TestError {
-                span: span(0..1),
-                kind: ast::ErrorKind::EscapeUnexpectedEof,
+                span: span(0..2),
+                kind: ast::ErrorKind::EscapeUnrecognized,
             }
         );
         assert_eq!(
@@ -4311,6 +4315,31 @@ bar
                 kind: ast::ErrorKind::EscapeUnrecognized,
             }
         );
+        // But also, < and > are banned, so that we may evolve them into
+        // start/end word boundary assertions. (Not sure if we will...)
+        assert_eq!(
+            parser(r"\<").parse_escape().unwrap_err(),
+            TestError {
+                span: span(0..2),
+                kind: ast::ErrorKind::EscapeUnrecognized,
+            }
+        );
+        assert_eq!(
+            parser(r"\>").parse_escape().unwrap_err(),
+            TestError {
+                span: span(0..2),
+                kind: ast::ErrorKind::EscapeUnrecognized,
+            }
+        );
+
+        // An unfinished escape is illegal.
+        assert_eq!(
+            parser(r"\").parse_escape().unwrap_err(),
+            TestError {
+                span: span(0..1),
+                kind: ast::ErrorKind::EscapeUnexpectedEof,
+            }
+        );
     }
 
     #[test]
@@ -4907,7 +4936,7 @@ bar
                         lit(span(1..2), 'a'),
                         ast::ClassSetItem::Literal(ast::Literal {
                             span: span(2..4),
-                            kind: ast::LiteralKind::Punctuation,
+                            kind: ast::LiteralKind::Meta,
                             c: ']',
                         }),
                     ]
@@ -4925,7 +4954,7 @@ bar
                         lit(span(1..2), 'a'),
                         ast::ClassSetItem::Literal(ast::Literal {
                             span: span(2..4),
-                            kind: ast::LiteralKind::Punctuation,
+                            kind: ast::LiteralKind::Meta,
                             c: '-',
                         }),
                         lit(span(4..5), 'z'),
@@ -5117,7 +5146,7 @@ bar
                     span(1..6),
                     itemset(ast::ClassSetItem::Literal(ast::Literal {
                         span: span(1..3),
-                        kind: ast::LiteralKind::Punctuation,
+                        kind: ast::LiteralKind::Meta,
                         c: '^',
                     })),
                     itemset(lit(span(5..6), '^')),
@@ -5133,7 +5162,7 @@ bar
                     span(1..6),
                     itemset(ast::ClassSetItem::Literal(ast::Literal {
                         span: span(1..3),
-                        kind: ast::LiteralKind::Punctuation,
+                        kind: ast::LiteralKind::Meta,
                         c: '&',
                     })),
                     itemset(lit(span(5..6), '&')),
@@ -5198,7 +5227,7 @@ bar
                         lit(span(1..2), ']'),
                         ast::ClassSetItem::Literal(ast::Literal {
                             span: span(2..4),
-                            kind: ast::LiteralKind::Punctuation,
+                            kind: ast::LiteralKind::Meta,
                             c: '[',
                         }),
                     ]
@@ -5216,7 +5245,7 @@ bar
                         kind: itemset(ast::ClassSetItem::Literal(
                             ast::Literal {
                                 span: span(1..3),
-                                kind: ast::LiteralKind::Punctuation,
+                                kind: ast::LiteralKind::Meta,
                                 c: '[',
                             }
                         )),
diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs
@@ -216,7 +216,7 @@ impl<W: fmt::Write> Writer<W> {
 
         match ast.kind {
             Verbatim => self.wtr.write_char(ast.c),
-            Punctuation => write!(self.wtr, r"\{}", ast.c),
+            Meta | Superfluous => write!(self.wtr, r"\{}", ast.c),
             Octal => write!(self.wtr, r"\{:o}", u32::from(ast.c)),
             HexFixed(ast::HexLiteralKind::X) => {
                 write!(self.wtr, r"\x{:02X}", u32::from(ast.c))
diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs
@@ -215,13 +215,43 @@ pub fn escape_into(text: &str, buf: &mut String) {
 
 /// Returns true if the given character has significance in a regex.
 ///
-/// These are the only characters that are allowed to be escaped, with one
-/// exception: an ASCII space character may be escaped when extended mode (with
-/// the `x` flag) is enabled. In particular, `is_meta_character(' ')` returns
-/// `false`.
+/// Generally speaking, these are the only characters which _must_ be escaped
+/// in order to match their literal meaning. For example, to match a literal
+/// `|`, one could write `\|`. Sometimes escaping isn't always necessary. For
+/// example, `-` is treated as a meta character because of its significance
+/// for writing ranges inside of character classes, but the regex `-` will
+/// match a literal `-` because `-` has no special meaning outside of character
+/// classes.
+///
+/// In order to determine whether a character may be escaped at all, the
+/// [`is_escapeable_character`] routine should be used. The difference between
+/// `is_meta_character` and `is_escapeable_character` is that the latter will
+/// return true for some characters that are _not_ meta characters. For
+/// example, `%` and `\%` both match a literal `%` in all contexts. In other
+/// words, `is_escapeable_character` includes "superfluous" escapes.
 ///
 /// Note that the set of characters for which this function returns `true` or
-/// `false` is fixed and won't change in a semver compatible release.
+/// `false` is fixed and won't change in a semver compatible release. (In this
+/// case, "semver compatible release" actually refers to the `regex` crate
+/// itself, since reducing or expanding the set of meta characters would be a
+/// breaking change for not just `regex-syntax` but also `regex` itself.)
+///
+/// # Example
+///
+/// ```
+/// use regex_syntax::is_meta_character;
+///
+/// assert!(is_meta_character('?'));
+/// assert!(is_meta_character('-'));
+/// assert!(is_meta_character('&'));
+/// assert!(is_meta_character('#'));
+///
+/// assert!(!is_meta_character('%'));
+/// assert!(!is_meta_character('/'));
+/// assert!(!is_meta_character('!'));
+/// assert!(!is_meta_character('"'));
+/// assert!(!is_meta_character('e'));
+/// ```
 pub fn is_meta_character(c: char) -> bool {
     match c {
         '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{'
@@ -230,6 +260,68 @@ pub fn is_meta_character(c: char) -> bool {
     }
 }
 
+/// Returns true if the given character can be escaped in a regex.
+///
+/// This returns true in all cases that `is_meta_character` returns true, but
+/// also returns true in some cases where `is_meta_character` returns false.
+/// For example, `%` is not a meta character, but it is escapeable. That is,
+/// `%` and `\%` both match a literal `%` in all contexts.
+///
+/// The purpose of this routine is to provide knowledge about what characters
+/// may be escaped. Namely, most regex engines permit "superfluous" escapes
+/// where characters without any special significance may be escaped even
+/// though there is no actual _need_ to do so.
+///
+/// This will return false for some characters. For example, `e` is not
+/// escapeable. Therefore, `\e` will either result in a parse error (which is
+/// true today), or it could backwards compatibly evolve into a new construct
+/// with its own meaning. Indeed, that is the purpose of banning _some_
+/// superfluous escapes: it provides a way to evolve the syntax in a compatible
+/// manner.
+///
+/// # Example
+///
+/// ```
+/// use regex_syntax::is_escapeable_character;
+///
+/// assert!(is_escapeable_character('?'));
+/// assert!(is_escapeable_character('-'));
+/// assert!(is_escapeable_character('&'));
+/// assert!(is_escapeable_character('#'));
+/// assert!(is_escapeable_character('%'));
+/// assert!(is_escapeable_character('/'));
+/// assert!(is_escapeable_character('!'));
+/// assert!(is_escapeable_character('"'));
+///
+/// assert!(!is_escapeable_character('e'));
+/// ```
+pub fn is_escapeable_character(c: char) -> bool {
+    // Certainly escapeable if it's a meta character.
+    if is_meta_character(c) {
+        return true;
+    }
+    // Any character that isn't ASCII is definitely not escapeable. There's
+    // no real need to allow things like \☃ right?
+    if !c.is_ascii() {
+        return false;
+    }
+    // Otherwise, we basically say that everything is escapeable unless it's a
+    // letter or digit. Things like \3 are either octal (when enabled) or an
+    // error, and we should keep it that way. Otherwise, letters are reserved
+    // for adding new syntax in a backwards compatible way.
+    match c {
+        '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
+        // While not currently supported, we keep these as not escapeable to
+        // give us some flexibility with respect to supporting the \< and
+        // \> word boundary assertions in the future. By rejecting them as
+        // escapeable, \< and \> will result in a parse error. Thus, we can
+        // turn them into something else in the future without it being a
+        // backwards incompatible change.
+        '<' | '>' => false,
+        _ => true,
+    }
+}
+
 /// Returns true if and only if the given character is a Unicode word
 /// character.
 ///