Skip to content

Commit d258c29

Browse files
committed
syntax: reject '(?-u)\W' when UTF-8 mode is enabled
When Unicode mode is disabled (i.e., (?-u)), the Perl character classes (\w, \d and \s) revert to their ASCII definitions. The negated forms of these classes are also derived from their ASCII definitions, and this means that they may actually match bytes outside of ASCII and thus possibly invalid UTF-8. For this reason, when the translator is configured to only produce HIR that matches valid UTF-8, '(?-u)\W' should be rejected.

Previously, it was not being rejected, which could actually lead to matches that produced offsets that split codepoints, and thus lead to panics when match offsets are used to slice a string. For example, this code

    fn main() {
        let re = regex::Regex::new(r"(?-u)\W").unwrap();
        let haystack = "☃";
        if let Some(m) = re.find(haystack) {
            println!("{:?}", &haystack[m.range()]);
        }
    }

panics with

    byte index 1 is not a char boundary; it is inside '☃' (bytes 0..3) of `☃`

That is, it reports a match at 0..1, which is technically correct, but the regex itself should have been rejected in the first place since the top-level Regex API always has UTF-8 mode enabled.

Also, many of the replacement tests were using '(?-u)\W' (or similar) for some reason. I'm not sure why, so I just removed the '(?-u)' to make those tests pass. Whether Unicode is enabled or not doesn't seem to be an interesting detail for those tests. (All haystacks and replacements appear to be ASCII.)

Fixes #895, Partially addresses #738
1 parent 78c647d commit d258c29

File tree

2 files changed

+92
-19
lines changed

2 files changed

+92
-19
lines changed

regex-syntax/src/hir/translate.rs

+84-11
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
305305
let hcls = hir::Class::Unicode(cls);
306306
self.push(HirFrame::Expr(Hir::class(hcls)));
307307
} else {
308-
let cls = self.hir_perl_byte_class(x);
308+
let cls = self.hir_perl_byte_class(x)?;
309309
let hcls = hir::Class::Bytes(cls);
310310
self.push(HirFrame::Expr(Hir::class(hcls)));
311311
}
@@ -445,7 +445,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
445445
cls.union(&xcls);
446446
self.push(HirFrame::ClassUnicode(cls));
447447
} else {
448-
let xcls = self.hir_perl_byte_class(x);
448+
let xcls = self.hir_perl_byte_class(x)?;
449449
let mut cls = self.pop().unwrap().unwrap_class_bytes();
450450
cls.union(&xcls);
451451
self.push(HirFrame::ClassBytes(cls));
@@ -879,7 +879,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
879879
fn hir_perl_byte_class(
880880
&self,
881881
ast_class: &ast::ClassPerl,
882-
) -> hir::ClassBytes {
882+
) -> Result<hir::ClassBytes> {
883883
use crate::ast::ClassPerlKind::*;
884884

885885
assert!(!self.flags().unicode());
@@ -893,7 +893,13 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
893893
if ast_class.negated {
894894
class.negate();
895895
}
896-
class
896+
// Negating a Perl byte class is likely to cause it to match invalid
897+
// UTF-8. That's only OK if the translator is configured to allow such
898+
// things.
899+
if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() {
900+
return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8));
901+
}
902+
Ok(class)
897903
}
898904

899905
/// Converts the given Unicode specific error to an HIR translation error.
@@ -1971,7 +1977,7 @@ mod tests {
19711977

19721978
#[test]
19731979
#[cfg(feature = "unicode-perl")]
1974-
fn class_perl() {
1980+
fn class_perl_unicode() {
19751981
// Unicode
19761982
assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit")));
19771983
assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space")));
@@ -2011,7 +2017,10 @@ mod tests {
20112017
);
20122018
#[cfg(feature = "unicode-case")]
20132019
assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
2020+
}
20142021

2022+
#[test]
2023+
fn class_perl_ascii() {
20152024
// ASCII only
20162025
assert_eq!(
20172026
t(r"(?-u)\d"),
@@ -2040,29 +2049,93 @@ mod tests {
20402049

20412050
// ASCII only, negated
20422051
assert_eq!(
2043-
t(r"(?-u)\D"),
2052+
t_bytes(r"(?-u)\D"),
20442053
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
20452054
);
20462055
assert_eq!(
2047-
t(r"(?-u)\S"),
2056+
t_bytes(r"(?-u)\S"),
20482057
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
20492058
);
20502059
assert_eq!(
2051-
t(r"(?-u)\W"),
2060+
t_bytes(r"(?-u)\W"),
20522061
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
20532062
);
20542063
assert_eq!(
2055-
t(r"(?i-u)\D"),
2064+
t_bytes(r"(?i-u)\D"),
20562065
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
20572066
);
20582067
assert_eq!(
2059-
t(r"(?i-u)\S"),
2068+
t_bytes(r"(?i-u)\S"),
20602069
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
20612070
);
20622071
assert_eq!(
2063-
t(r"(?i-u)\W"),
2072+
t_bytes(r"(?i-u)\W"),
20642073
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
20652074
);
2075+
2076+
// ASCII only, negated, with UTF-8 mode enabled.
2077+
// In this case, negating any Perl class results in an error because
2078+
// all such classes can match invalid UTF-8.
2079+
assert_eq!(
2080+
t_err(r"(?-u)\D"),
2081+
TestError {
2082+
kind: hir::ErrorKind::InvalidUtf8,
2083+
span: Span::new(
2084+
Position::new(5, 1, 6),
2085+
Position::new(7, 1, 8),
2086+
),
2087+
},
2088+
);
2089+
assert_eq!(
2090+
t_err(r"(?-u)\S"),
2091+
TestError {
2092+
kind: hir::ErrorKind::InvalidUtf8,
2093+
span: Span::new(
2094+
Position::new(5, 1, 6),
2095+
Position::new(7, 1, 8),
2096+
),
2097+
},
2098+
);
2099+
assert_eq!(
2100+
t_err(r"(?-u)\W"),
2101+
TestError {
2102+
kind: hir::ErrorKind::InvalidUtf8,
2103+
span: Span::new(
2104+
Position::new(5, 1, 6),
2105+
Position::new(7, 1, 8),
2106+
),
2107+
},
2108+
);
2109+
assert_eq!(
2110+
t_err(r"(?i-u)\D"),
2111+
TestError {
2112+
kind: hir::ErrorKind::InvalidUtf8,
2113+
span: Span::new(
2114+
Position::new(6, 1, 7),
2115+
Position::new(8, 1, 9),
2116+
),
2117+
},
2118+
);
2119+
assert_eq!(
2120+
t_err(r"(?i-u)\S"),
2121+
TestError {
2122+
kind: hir::ErrorKind::InvalidUtf8,
2123+
span: Span::new(
2124+
Position::new(6, 1, 7),
2125+
Position::new(8, 1, 9),
2126+
),
2127+
},
2128+
);
2129+
assert_eq!(
2130+
t_err(r"(?i-u)\W"),
2131+
TestError {
2132+
kind: hir::ErrorKind::InvalidUtf8,
2133+
span: Span::new(
2134+
Position::new(6, 1, 7),
2135+
Position::new(8, 1, 9),
2136+
),
2137+
},
2138+
);
20662139
}
20672140

20682141
#[test]

tests/replace.rs

+8-8
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,15 @@ replace!(all, replace_all, r"[0-9]", "age: 26", t!("Z"), "age: ZZ");
1515
replace!(
1616
groups,
1717
replace,
18-
r"(?-u)(\S+)\s+(\S+)",
18+
r"([^ ]+)[ ]+([^ ]+)",
1919
"w1 w2",
2020
t!("$2 $1"),
2121
"w2 w1"
2222
);
2323
replace!(
2424
double_dollar,
2525
replace,
26-
r"(?-u)(\S+)\s+(\S+)",
26+
r"([^ ]+)[ ]+([^ ]+)",
2727
"w1 w2",
2828
t!("$2 $$1"),
2929
"w2 $1"
@@ -33,7 +33,7 @@ replace!(
3333
replace!(
3434
named,
3535
replace_all,
36-
r"(?-u)(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)",
36+
r"(?P<first>[^ ]+)[ ]+(?P<last>[^ ]+)(?P<space>[ ]*)",
3737
"w1 w2 w3 w4",
3838
t!("$last $first$space"),
3939
"w2 w1 w4 w3"
@@ -51,39 +51,39 @@ replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b");
5151
replace!(
5252
simple_expand,
5353
replace_all,
54-
r"(?-u)(\w) (\w)",
54+
r"([a-z]) ([a-z])",
5555
"a b",
5656
t!("$2 $1"),
5757
"b a"
5858
);
5959
replace!(
6060
literal_dollar1,
6161
replace_all,
62-
r"(?-u)(\w+) (\w+)",
62+
r"([a-z]+) ([a-z]+)",
6363
"a b",
6464
t!("$$1"),
6565
"$1"
6666
);
6767
replace!(
6868
literal_dollar2,
6969
replace_all,
70-
r"(?-u)(\w+) (\w+)",
70+
r"([a-z]+) ([a-z]+)",
7171
"a b",
7272
t!("$2 $$c $1"),
7373
"b $c a"
7474
);
7575
replace!(
7676
no_expand1,
7777
replace,
78-
r"(?-u)(\S+)\s+(\S+)",
78+
r"([^ ]+)[ ]+([^ ]+)",
7979
"w1 w2",
8080
no_expand!("$2 $1"),
8181
"$2 $1"
8282
);
8383
replace!(
8484
no_expand2,
8585
replace,
86-
r"(?-u)(\S+)\s+(\S+)",
86+
r"([^ ]+)[ ]+([^ ]+)",
8787
"w1 w2",
8888
no_expand!("$$1"),
8989
"$$1"

0 commit comments

Comments
 (0)