Skip to content

Commit fe8d667

Browse files
committed
syntax: permit most no-op escape sequences
This resolves a long-standing (but somewhat minor) complaint that folks have with the regex crate: it does not permit escaping punctuation characters in cases where those characters do not need to be escaped. So things like \/, \" and \! would result in parse errors. Most other regex engines permit these, even in cases where they aren't needed. I had been against doing this for future evolution purposes, but it's incredibly unlikely that we're ever going to add a new meta character to the syntax. I literally cannot think of any conceivable future in which that might happen. However, we do continue to ban escapes for [0-9A-Za-z<>], because it is conceivable that we might add new escape sequences for those characters. (And 0-9 are already banned by virtue of them looking too much like backreferences, which aren't supported.) For example, we could add \Q...\E literal syntax. Or \< and \> as start and end word boundaries, as found in POSIX regex engines. Fixes #501
1 parent 5178e7b commit fe8d667

File tree

4 files changed

+185
-61
lines changed

4 files changed

+185
-61
lines changed

regex-syntax/src/ast/mod.rs

+6-3
Original file line numberDiff line numberDiff line change
@@ -588,9 +588,12 @@ impl Literal {
588588
pub enum LiteralKind {
589589
/// The literal is written verbatim, e.g., `a` or `☃`.
590590
Verbatim,
591-
/// The literal is written as an escape because it is punctuation, e.g.,
592-
/// `\*` or `\[`.
593-
Punctuation,
591+
/// The literal is written as an escape because it is otherwise a special
592+
/// regex meta character, e.g., `\*` or `\[`.
593+
Meta,
594+
/// The literal is written as an escape despite the fact that the escape is
595+
/// unnecessary, e.g., `\%` or `\/`.
596+
Superfluous,
594597
/// The literal is written as an octal escape, e.g., `\141`.
595598
Octal,
596599
/// The literal is written as a hex code with a fixed number of digits

regex-syntax/src/ast/parse.rs

+81-52
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ use alloc::{
1818
use crate::{
1919
ast::{self, Ast, Position, Span},
2020
either::Either,
21-
is_meta_character,
21+
is_escapeable_character, is_meta_character,
2222
};
2323

2424
type Result<T> = core::result::Result<T, ast::Error>;
@@ -1495,7 +1495,14 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
14951495
if is_meta_character(c) {
14961496
return Ok(Primitive::Literal(ast::Literal {
14971497
span,
1498-
kind: ast::LiteralKind::Punctuation,
1498+
kind: ast::LiteralKind::Meta,
1499+
c,
1500+
}));
1501+
}
1502+
if is_escapeable_character(c) {
1503+
return Ok(Primitive::Literal(ast::Literal {
1504+
span,
1505+
kind: ast::LiteralKind::Superfluous,
14991506
c,
15001507
}));
15011508
}
@@ -1513,9 +1520,6 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
15131520
'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'),
15141521
'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'),
15151522
'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'),
1516-
' ' if self.ignore_whitespace() => {
1517-
special(ast::SpecialLiteralKind::Space, ' ')
1518-
}
15191523
'A' => Ok(Primitive::Assertion(ast::Assertion {
15201524
span,
15211525
kind: ast::AssertionKind::StartText,
@@ -2420,13 +2424,9 @@ mod tests {
24202424
lit_with(c, span(start..start + c.len_utf8()))
24212425
}
24222426

2423-
/// Create a punctuation literal starting at the given position.
2424-
fn punct_lit(c: char, span: Span) -> Ast {
2425-
Ast::Literal(ast::Literal {
2426-
span,
2427-
kind: ast::LiteralKind::Punctuation,
2428-
c,
2429-
})
2427+
/// Create a meta literal (`LiteralKind::Meta`) for `c` with the given span.
2428+
fn meta_lit(c: char, span: Span) -> Ast {
2429+
Ast::Literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c })
24302430
}
24312431

24322432
/// Create a verbatim literal with the given span.
@@ -2710,24 +2710,24 @@ bar
27102710
Ok(concat(
27112711
0..36,
27122712
vec![
2713-
punct_lit('\\', span(0..2)),
2714-
punct_lit('.', span(2..4)),
2715-
punct_lit('+', span(4..6)),
2716-
punct_lit('*', span(6..8)),
2717-
punct_lit('?', span(8..10)),
2718-
punct_lit('(', span(10..12)),
2719-
punct_lit(')', span(12..14)),
2720-
punct_lit('|', span(14..16)),
2721-
punct_lit('[', span(16..18)),
2722-
punct_lit(']', span(18..20)),
2723-
punct_lit('{', span(20..22)),
2724-
punct_lit('}', span(22..24)),
2725-
punct_lit('^', span(24..26)),
2726-
punct_lit('$', span(26..28)),
2727-
punct_lit('#', span(28..30)),
2728-
punct_lit('&', span(30..32)),
2729-
punct_lit('-', span(32..34)),
2730-
punct_lit('~', span(34..36)),
2713+
meta_lit('\\', span(0..2)),
2714+
meta_lit('.', span(2..4)),
2715+
meta_lit('+', span(4..6)),
2716+
meta_lit('*', span(6..8)),
2717+
meta_lit('?', span(8..10)),
2718+
meta_lit('(', span(10..12)),
2719+
meta_lit(')', span(12..14)),
2720+
meta_lit('|', span(14..16)),
2721+
meta_lit('[', span(16..18)),
2722+
meta_lit(']', span(18..20)),
2723+
meta_lit('{', span(20..22)),
2724+
meta_lit('}', span(22..24)),
2725+
meta_lit('^', span(24..26)),
2726+
meta_lit('$', span(26..28)),
2727+
meta_lit('#', span(28..30)),
2728+
meta_lit('&', span(30..32)),
2729+
meta_lit('-', span(32..34)),
2730+
meta_lit('~', span(34..36)),
27312731
]
27322732
))
27332733
);
@@ -2879,23 +2879,12 @@ bar
28792879
flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
28802880
Ast::Literal(ast::Literal {
28812881
span: span_range(pat, 4..6),
2882-
kind: ast::LiteralKind::Special(
2883-
ast::SpecialLiteralKind::Space
2884-
),
2882+
kind: ast::LiteralKind::Superfluous,
28852883
c: ' ',
28862884
}),
28872885
]
28882886
))
28892887
);
2890-
// ... but only when `x` mode is enabled.
2891-
let pat = r"\ ";
2892-
assert_eq!(
2893-
parser(pat).parse().unwrap_err(),
2894-
TestError {
2895-
span: span_range(pat, 0..2),
2896-
kind: ast::ErrorKind::EscapeUnrecognized,
2897-
}
2898-
);
28992888
}
29002889

29012890
#[test]
@@ -4246,7 +4235,7 @@ bar
42464235
parser(r"\|").parse_primitive(),
42474236
Ok(Primitive::Literal(ast::Literal {
42484237
span: span(0..2),
4249-
kind: ast::LiteralKind::Punctuation,
4238+
kind: ast::LiteralKind::Meta,
42504239
c: '|',
42514240
}))
42524241
);
@@ -4297,11 +4286,26 @@ bar
42974286
}))
42984287
);
42994288

4289+
// We now also support superfluous escapes in most cases.
4290+
for c in ['!', '@', '%', '"', '\'', '/', ' '] {
4291+
let pat = format!(r"\{}", c);
4292+
assert_eq!(
4293+
parser(&pat).parse_primitive(),
4294+
Ok(Primitive::Literal(ast::Literal {
4295+
span: span(0..2),
4296+
kind: ast::LiteralKind::Superfluous,
4297+
c,
4298+
}))
4299+
);
4300+
}
4301+
4302+
// Some superfluous escapes, namely [0-9A-Za-z], are still banned. This
4303+
// gives flexibility for future evolution.
43004304
assert_eq!(
4301-
parser(r"\").parse_escape().unwrap_err(),
4305+
parser(r"\e").parse_escape().unwrap_err(),
43024306
TestError {
4303-
span: span(0..1),
4304-
kind: ast::ErrorKind::EscapeUnexpectedEof,
4307+
span: span(0..2),
4308+
kind: ast::ErrorKind::EscapeUnrecognized,
43054309
}
43064310
);
43074311
assert_eq!(
@@ -4311,6 +4315,31 @@ bar
43114315
kind: ast::ErrorKind::EscapeUnrecognized,
43124316
}
43134317
);
4318+
// But also, < and > are banned, so that we may evolve them into
4319+
// start/end word boundary assertions. (Not sure if we will...)
4320+
assert_eq!(
4321+
parser(r"\<").parse_escape().unwrap_err(),
4322+
TestError {
4323+
span: span(0..2),
4324+
kind: ast::ErrorKind::EscapeUnrecognized,
4325+
}
4326+
);
4327+
assert_eq!(
4328+
parser(r"\>").parse_escape().unwrap_err(),
4329+
TestError {
4330+
span: span(0..2),
4331+
kind: ast::ErrorKind::EscapeUnrecognized,
4332+
}
4333+
);
4334+
4335+
// An unfinished escape is illegal.
4336+
assert_eq!(
4337+
parser(r"\").parse_escape().unwrap_err(),
4338+
TestError {
4339+
span: span(0..1),
4340+
kind: ast::ErrorKind::EscapeUnexpectedEof,
4341+
}
4342+
);
43144343
}
43154344

43164345
#[test]
@@ -4907,7 +4936,7 @@ bar
49074936
lit(span(1..2), 'a'),
49084937
ast::ClassSetItem::Literal(ast::Literal {
49094938
span: span(2..4),
4910-
kind: ast::LiteralKind::Punctuation,
4939+
kind: ast::LiteralKind::Meta,
49114940
c: ']',
49124941
}),
49134942
]
@@ -4925,7 +4954,7 @@ bar
49254954
lit(span(1..2), 'a'),
49264955
ast::ClassSetItem::Literal(ast::Literal {
49274956
span: span(2..4),
4928-
kind: ast::LiteralKind::Punctuation,
4957+
kind: ast::LiteralKind::Meta,
49294958
c: '-',
49304959
}),
49314960
lit(span(4..5), 'z'),
@@ -5117,7 +5146,7 @@ bar
51175146
span(1..6),
51185147
itemset(ast::ClassSetItem::Literal(ast::Literal {
51195148
span: span(1..3),
5120-
kind: ast::LiteralKind::Punctuation,
5149+
kind: ast::LiteralKind::Meta,
51215150
c: '^',
51225151
})),
51235152
itemset(lit(span(5..6), '^')),
@@ -5133,7 +5162,7 @@ bar
51335162
span(1..6),
51345163
itemset(ast::ClassSetItem::Literal(ast::Literal {
51355164
span: span(1..3),
5136-
kind: ast::LiteralKind::Punctuation,
5165+
kind: ast::LiteralKind::Meta,
51375166
c: '&',
51385167
})),
51395168
itemset(lit(span(5..6), '&')),
@@ -5198,7 +5227,7 @@ bar
51985227
lit(span(1..2), ']'),
51995228
ast::ClassSetItem::Literal(ast::Literal {
52005229
span: span(2..4),
5201-
kind: ast::LiteralKind::Punctuation,
5230+
kind: ast::LiteralKind::Meta,
52025231
c: '[',
52035232
}),
52045233
]
@@ -5216,7 +5245,7 @@ bar
52165245
kind: itemset(ast::ClassSetItem::Literal(
52175246
ast::Literal {
52185247
span: span(1..3),
5219-
kind: ast::LiteralKind::Punctuation,
5248+
kind: ast::LiteralKind::Meta,
52205249
c: '[',
52215250
}
52225251
)),

regex-syntax/src/ast/print.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ impl<W: fmt::Write> Writer<W> {
216216

217217
match ast.kind {
218218
Verbatim => self.wtr.write_char(ast.c),
219-
Punctuation => write!(self.wtr, r"\{}", ast.c),
219+
Meta | Superfluous => write!(self.wtr, r"\{}", ast.c),
220220
Octal => write!(self.wtr, r"\{:o}", u32::from(ast.c)),
221221
HexFixed(ast::HexLiteralKind::X) => {
222222
write!(self.wtr, r"\x{:02X}", u32::from(ast.c))

regex-syntax/src/lib.rs

+97-5
Original file line numberDiff line numberDiff line change
@@ -215,13 +215,43 @@ pub fn escape_into(text: &str, buf: &mut String) {
215215

216216
/// Returns true if the given character has significance in a regex.
217217
///
218-
/// These are the only characters that are allowed to be escaped, with one
219-
/// exception: an ASCII space character may be escaped when extended mode (with
220-
/// the `x` flag) is enabled. In particular, `is_meta_character(' ')` returns
221-
/// `false`.
218+
/// Generally speaking, these are the only characters which _must_ be escaped
219+
/// in order to match their literal meaning. For example, to match a literal
220+
/// `|`, one could write `\|`. Escaping isn't always necessary, though. For
221+
/// example, `-` is treated as a meta character because of its significance
222+
/// for writing ranges inside of character classes, but the regex `-` will
223+
/// match a literal `-` because `-` has no special meaning outside of character
224+
/// classes.
225+
///
226+
/// In order to determine whether a character may be escaped at all, the
227+
/// [`is_escapeable_character`] routine should be used. The difference between
228+
/// `is_meta_character` and `is_escapeable_character` is that the latter will
229+
/// return true for some characters that are _not_ meta characters. For
230+
/// example, `%` and `\%` both match a literal `%` in all contexts. In other
231+
/// words, `is_escapeable_character` includes "superfluous" escapes.
222232
///
223233
/// Note that the set of characters for which this function returns `true` or
224-
/// `false` is fixed and won't change in a semver compatible release.
234+
/// `false` is fixed and won't change in a semver compatible release. (In this
235+
/// case, "semver compatible release" actually refers to the `regex` crate
236+
/// itself, since reducing or expanding the set of meta characters would be a
237+
/// breaking change for not just `regex-syntax` but also `regex` itself.)
238+
///
239+
/// # Example
240+
///
241+
/// ```
242+
/// use regex_syntax::is_meta_character;
243+
///
244+
/// assert!(is_meta_character('?'));
245+
/// assert!(is_meta_character('-'));
246+
/// assert!(is_meta_character('&'));
247+
/// assert!(is_meta_character('#'));
248+
///
249+
/// assert!(!is_meta_character('%'));
250+
/// assert!(!is_meta_character('/'));
251+
/// assert!(!is_meta_character('!'));
252+
/// assert!(!is_meta_character('"'));
253+
/// assert!(!is_meta_character('e'));
254+
/// ```
225255
pub fn is_meta_character(c: char) -> bool {
226256
match c {
227257
'\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{'
@@ -230,6 +260,68 @@ pub fn is_meta_character(c: char) -> bool {
230260
}
231261
}
232262

263+
/// Returns true if the given character can be escaped in a regex.
264+
///
265+
/// This returns true in all cases that `is_meta_character` returns true, but
266+
/// also returns true in some cases where `is_meta_character` returns false.
267+
/// For example, `%` is not a meta character, but it is escapeable. That is,
268+
/// `%` and `\%` both match a literal `%` in all contexts.
269+
///
270+
/// The purpose of this routine is to provide knowledge about what characters
271+
/// may be escaped. Namely, most regex engines permit "superfluous" escapes
272+
/// where characters without any special significance may be escaped even
273+
/// though there is no actual _need_ to do so.
274+
///
275+
/// This will return false for some characters. For example, `e` is not
276+
/// escapeable. Therefore, `\e` will either result in a parse error (which is
277+
/// true today), or it could backwards compatibly evolve into a new construct
278+
/// with its own meaning. Indeed, that is the purpose of banning _some_
279+
/// superfluous escapes: it provides a way to evolve the syntax in a compatible
280+
/// manner.
281+
///
282+
/// # Example
283+
///
284+
/// ```
285+
/// use regex_syntax::is_escapeable_character;
286+
///
287+
/// assert!(is_escapeable_character('?'));
288+
/// assert!(is_escapeable_character('-'));
289+
/// assert!(is_escapeable_character('&'));
290+
/// assert!(is_escapeable_character('#'));
291+
/// assert!(is_escapeable_character('%'));
292+
/// assert!(is_escapeable_character('/'));
293+
/// assert!(is_escapeable_character('!'));
294+
/// assert!(is_escapeable_character('"'));
295+
///
296+
/// assert!(!is_escapeable_character('e'));
297+
/// ```
298+
pub fn is_escapeable_character(c: char) -> bool {
299+
// Certainly escapeable if it's a meta character.
300+
if is_meta_character(c) {
301+
return true;
302+
}
303+
// Any character that isn't ASCII is definitely not escapeable. There's
304+
// no real need to allow things like \☃, right?
305+
if !c.is_ascii() {
306+
return false;
307+
}
308+
// Otherwise, we basically say that everything is escapeable unless it's a
309+
// letter or digit. Things like \3 are either octal (when enabled) or an
310+
// error, and we should keep it that way. Separately, letters are reserved
311+
// for adding new syntax in a backwards compatible way.
312+
match c {
313+
'0'..='9' | 'A'..='Z' | 'a'..='z' => false,
314+
// While not currently supported, we keep these as not escapeable to
315+
// give us some flexibility with respect to supporting the \< and
316+
// \> word boundary assertions in the future. By rejecting them as
317+
// escapeable, \< and \> will result in a parse error. Thus, we can
318+
// turn them into something else in the future without it being a
319+
// backwards incompatible change.
320+
'<' | '>' => false,
321+
// Every remaining ASCII character (other punctuation, whitespace and
// control characters) may be escaped.
_ => true,
322+
}
323+
}
324+
233325
/// Returns true if and only if the given character is a Unicode word
234326
/// character.
235327
///

0 commit comments

Comments
 (0)