Skip to content

Commit cc5084c

Browse files
committed
syntax: add support for CRLF-aware line anchors
This adds Look::StartCRLF and Look::EndCRLF. And also adds a new flag, 'R', for making ^/$ be CRLF aware in multi-line mode. The 'R' flag also causes '.' to *not* match \r in addition to \n (unless the 's' flag is enabled of course). The intended semantics are that CRLF mode makes \r\n, \r and \n line terminators but with one key property: \r\n is treated as a single line terminator. That is, ^/$ do not match between \r and \n. This partially addresses #244 by adding syntax support. Currently, if you try to use this new flag, the regex compiler will report an error. We intend to finish support for this once #656 is complete. (Indeed, at time of writing, CRLF matching works in regex-automata.)
1 parent 854e4dc commit cc5084c

File tree

8 files changed

+226
-28
lines changed

8 files changed

+226
-28
lines changed

regex-syntax/src/ast/mod.rs

+2
Original file line numberDiff line numberDiff line change
@@ -1314,6 +1314,8 @@ pub enum Flag {
13141314
SwapGreed,
13151315
/// `u`
13161316
Unicode,
1317+
/// `R`
1318+
CRLF,
13171319
/// `x`
13181320
IgnoreWhitespace,
13191321
}

regex-syntax/src/ast/parse.rs

+30
Original file line numberDiff line numberDiff line change
@@ -1381,6 +1381,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
13811381
's' => Ok(ast::Flag::DotMatchesNewLine),
13821382
'U' => Ok(ast::Flag::SwapGreed),
13831383
'u' => Ok(ast::Flag::Unicode),
1384+
'R' => Ok(ast::Flag::CRLF),
13841385
'x' => Ok(ast::Flag::IgnoreWhitespace),
13851386
_ => {
13861387
Err(self
@@ -4084,6 +4085,34 @@ bar
40844085
],
40854086
})
40864087
);
4088+
assert_eq!(
4089+
parser("i-sR:").parse_flags(),
4090+
Ok(ast::Flags {
4091+
span: span(0..4),
4092+
items: vec![
4093+
ast::FlagsItem {
4094+
span: span(0..1),
4095+
kind: ast::FlagsItemKind::Flag(
4096+
ast::Flag::CaseInsensitive
4097+
),
4098+
},
4099+
ast::FlagsItem {
4100+
span: span(1..2),
4101+
kind: ast::FlagsItemKind::Negation,
4102+
},
4103+
ast::FlagsItem {
4104+
span: span(2..3),
4105+
kind: ast::FlagsItemKind::Flag(
4106+
ast::Flag::DotMatchesNewLine
4107+
),
4108+
},
4109+
ast::FlagsItem {
4110+
span: span(3..4),
4111+
kind: ast::FlagsItemKind::Flag(ast::Flag::CRLF),
4112+
},
4113+
],
4114+
})
4115+
);
40874116

40884117
assert_eq!(
40894118
parser("isU").parse_flags().unwrap_err(),
@@ -4145,6 +4174,7 @@ bar
41454174
assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine));
41464175
assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed));
41474176
assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode));
4177+
assert_eq!(parser("R").parse_flag(), Ok(ast::Flag::CRLF));
41484178
assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace));
41494179

41504180
assert_eq!(

regex-syntax/src/ast/print.rs

+1
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,7 @@ impl<W: fmt::Write> Writer<W> {
289289
Flag::DotMatchesNewLine => self.wtr.write_str("s"),
290290
Flag::SwapGreed => self.wtr.write_str("U"),
291291
Flag::Unicode => self.wtr.write_str("u"),
292+
Flag::CRLF => self.wtr.write_str("R"),
292293
Flag::IgnoreWhitespace => self.wtr.write_str("x"),
293294
},
294295
}?;

regex-syntax/src/hir/mod.rs

+62-21
Original file line numberDiff line numberDiff line change
@@ -471,10 +471,12 @@ impl Hir {
471471

472472
/// Returns an HIR expression for `.`.
473473
///
474-
/// * [`Dot::AnyChar`] maps to `(?su:.)`.
475-
/// * [`Dot::AnyByte`] maps to `(?s-u:.)`.
476-
/// * [`Dot::AnyCharExceptNL`] maps to `(?u-s:.)`.
477-
/// * [`Dot::AnyByteExceptNL`] maps to `(?-su:.)`.
474+
/// * [`Dot::AnyChar`] maps to `(?su-R:.)`.
475+
/// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`.
476+
/// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`.
477+
/// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`.
478+
/// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`.
479+
/// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`.
478480
///
479481
/// Note that this is a convenience routine for constructing the correct
480482
/// character class based on the value of `Dot`. There is no explicit "dot"
@@ -492,18 +494,32 @@ impl Hir {
492494
cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
493495
Hir::class(Class::Bytes(cls))
494496
}
495-
Dot::AnyCharExceptNL => {
497+
Dot::AnyCharExceptLF => {
496498
let mut cls = ClassUnicode::empty();
497499
cls.push(ClassUnicodeRange::new('\0', '\x09'));
498500
cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
499501
Hir::class(Class::Unicode(cls))
500502
}
501-
Dot::AnyByteExceptNL => {
503+
Dot::AnyCharExceptCRLF => {
504+
let mut cls = ClassUnicode::empty();
505+
cls.push(ClassUnicodeRange::new('\0', '\x09'));
506+
cls.push(ClassUnicodeRange::new('\x0B', '\x0C'));
507+
cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}'));
508+
Hir::class(Class::Unicode(cls))
509+
}
510+
Dot::AnyByteExceptLF => {
502511
let mut cls = ClassBytes::empty();
503512
cls.push(ClassBytesRange::new(b'\0', b'\x09'));
504513
cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
505514
Hir::class(Class::Bytes(cls))
506515
}
516+
Dot::AnyByteExceptCRLF => {
517+
let mut cls = ClassBytes::empty();
518+
cls.push(ClassBytesRange::new(b'\0', b'\x09'));
519+
cls.push(ClassBytesRange::new(b'\x0B', b'\x0C'));
520+
cls.push(ClassBytesRange::new(b'\x0E', b'\xFF'));
521+
Hir::class(Class::Bytes(cls))
522+
}
507523
}
508524
}
509525
}
@@ -1365,6 +1381,16 @@ pub enum Look {
13651381
/// at the end position of the input, or at the position immediately
13661382
/// preceding a `\n` character.
13671383
EndLF,
1384+
/// Match the beginning of a line or the beginning of text. Specifically,
1385+
/// this matches at the starting position of the input, or at the position
1386+
/// immediately following either a `\r` or `\n` character, but never after
1387+
/// a `\r` when a `\n` follows.
1388+
StartCRLF,
1389+
/// Match the end of a line or the end of text. Specifically, this matches
1390+
/// at the end position of the input, or at the position immediately
1391+
/// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
1392+
/// precedes it.
1393+
EndCRLF,
13681394
/// Match an ASCII-only word boundary. That is, this matches a position
13691395
/// where the left adjacent character and right adjacent character
13701396
/// correspond to a word and non-word or a non-word and word character.
@@ -1380,30 +1406,34 @@ pub enum Look {
13801406
}
13811407

13821408
impl Look {
1383-
fn from_repr(repr: u8) -> Option<Look> {
1409+
fn from_repr(repr: u16) -> Option<Look> {
13841410
match repr {
13851411
0 => Some(Look::Start),
13861412
1 => Some(Look::End),
13871413
2 => Some(Look::StartLF),
13881414
3 => Some(Look::EndLF),
1389-
4 => Some(Look::WordAscii),
1390-
5 => Some(Look::WordAsciiNegate),
1391-
6 => Some(Look::WordUnicode),
1392-
7 => Some(Look::WordUnicodeNegate),
1415+
4 => Some(Look::StartCRLF),
1416+
5 => Some(Look::EndCRLF),
1417+
6 => Some(Look::WordAscii),
1418+
7 => Some(Look::WordAsciiNegate),
1419+
8 => Some(Look::WordUnicode),
1420+
9 => Some(Look::WordUnicodeNegate),
13931421
_ => None,
13941422
}
13951423
}
13961424

1397-
fn as_repr(&self) -> u8 {
1425+
fn as_repr(&self) -> u16 {
13981426
match *self {
13991427
Look::Start => 0,
14001428
Look::End => 1,
14011429
Look::StartLF => 2,
14021430
Look::EndLF => 3,
1403-
Look::WordAscii => 4,
1404-
Look::WordAsciiNegate => 5,
1405-
Look::WordUnicode => 6,
1406-
Look::WordUnicodeNegate => 7,
1431+
Look::StartCRLF => 4, // must mirror from_repr (4 <=> StartCRLF) and stay distinct from EndCRLF
1432+
Look::EndCRLF => 5,
1433+
Look::WordAscii => 6,
1434+
Look::WordAsciiNegate => 7,
1435+
Look::WordUnicode => 8,
1436+
Look::WordUnicodeNegate => 9,
14071437
}
14081438
}
14091439

@@ -1413,6 +1443,8 @@ impl Look {
14131443
Look::End => 'z',
14141444
Look::StartLF => '^',
14151445
Look::EndLF => '$',
1446+
Look::StartCRLF => '^',
1447+
Look::EndCRLF => '$',
14161448
Look::WordAscii => 'b',
14171449
Look::WordAsciiNegate => 'B',
14181450
Look::WordUnicode => '𝛃',
@@ -1505,11 +1537,20 @@ pub enum Dot {
15051537
/// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
15061538
///
15071539
/// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
1508-
AnyCharExceptNL,
1540+
AnyCharExceptLF,
1541+
/// Matches the UTF-8 encoding of any Unicode scalar value except for `\r`
1542+
/// and `\n`.
1543+
///
1544+
/// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
1545+
AnyCharExceptCRLF,
15091546
/// Matches any byte value except for `\n`.
15101547
///
15111548
/// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
1512-
AnyByteExceptNL,
1549+
AnyByteExceptLF,
1550+
/// Matches any byte value except for `\r` and `\n`.
1551+
///
1552+
/// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`.
1553+
AnyByteExceptCRLF,
15131554
}
15141555

15151556
/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
@@ -2038,7 +2079,7 @@ impl Properties {
20382079
/// example, an [`Hir`] provides properties that return `LookSet`s.
20392080
#[derive(Clone, Copy, Default, Eq, PartialEq)]
20402081
pub struct LookSet {
2041-
bits: u8,
2082+
bits: u16,
20422083
}
20432084

20442085
impl LookSet {
@@ -2170,8 +2211,8 @@ impl Iterator for LookSetIter {
21702211
#[inline]
21712212
fn next(&mut self) -> Option<Look> {
21722213
// We'll never have more than u8::MAX distinct look-around assertions,
2173-
// so 'repr' will always fit into a usize.
2174-
let repr = u8::try_from(self.set.bits.trailing_zeros()).unwrap();
2214+
// so 'repr' will always fit into a u16.
2215+
let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
21752216
let look = Look::from_repr(repr)?;
21762217
self.set.remove(look);
21772218
Some(look)

regex-syntax/src/hir/print.rs

+6
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,12 @@ impl<W: fmt::Write> Visitor for Writer<W> {
177177
hir::Look::EndLF => {
178178
self.wtr.write_str("(?m:$)")?;
179179
}
180+
hir::Look::StartCRLF => {
181+
self.wtr.write_str("(?mR:^)")?;
182+
}
183+
hir::Look::EndCRLF => {
184+
self.wtr.write_str("(?mR:$)")?;
185+
}
180186
hir::Look::WordAscii => {
181187
self.wtr.write_str(r"(?-u:\b)")?;
182188
}

0 commit comments

Comments
 (0)