Skip to content

Commit 4a30dfb

Browse files
committed
syntax: reject '(?-u)\W' when UTF-8 mode is enabled
When Unicode mode is disabled (i.e., (?-u)), the Perl character classes (\w, \d and \s) revert to their ASCII definitions. The negated forms of these classes are also derived from their ASCII definitions, and this means that they may actually match bytes outside of ASCII and thus possibly invalid UTF-8. For this reason, when the translator is configured to only produce HIR that matches valid UTF-8, '(?-u)\W' should be rejected.

Previously, it was not being rejected, which could actually lead to matches that produced offsets that split codepoints, and thus lead to panics when match offsets are used to slice a string. For example, this code

    fn main() {
        let re = regex::Regex::new(r"(?-u)\W").unwrap();
        let haystack = "☃";
        if let Some(m) = re.find(haystack) {
            println!("{:?}", &haystack[m.range()]);
        }
    }

panics with

    byte index 1 is not a char boundary; it is inside '☃' (bytes 0..3) of `☃`

That is, it reports a match at 0..1, which is technically correct, but the regex itself should have been rejected in the first place since the top-level Regex API always has UTF-8 mode enabled.

Also, many of the replacement tests were using '(?-u)\W' (or similar) for some reason. I'm not sure why, so I just removed the '(?-u)' to make those tests pass. Whether Unicode is enabled or not doesn't seem to be an interesting detail for those tests. (All haystacks and replacements appear to be ASCII.)

Fixes #895, Partially addresses #738
1 parent 2be7858 commit 4a30dfb

File tree

2 files changed

+92
-40
lines changed

2 files changed

+92
-40
lines changed

regex-syntax/src/hir/translate.rs

+84-11
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
305305
let hcls = hir::Class::Unicode(cls);
306306
self.push(HirFrame::Expr(Hir::class(hcls)));
307307
} else {
308-
let cls = self.hir_perl_byte_class(x);
308+
let cls = self.hir_perl_byte_class(x)?;
309309
let hcls = hir::Class::Bytes(cls);
310310
self.push(HirFrame::Expr(Hir::class(hcls)));
311311
}
@@ -445,7 +445,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
445445
cls.union(&xcls);
446446
self.push(HirFrame::ClassUnicode(cls));
447447
} else {
448-
let xcls = self.hir_perl_byte_class(x);
448+
let xcls = self.hir_perl_byte_class(x)?;
449449
let mut cls = self.pop().unwrap().unwrap_class_bytes();
450450
cls.union(&xcls);
451451
self.push(HirFrame::ClassBytes(cls));
@@ -877,7 +877,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
877877
fn hir_perl_byte_class(
878878
&self,
879879
ast_class: &ast::ClassPerl,
880-
) -> hir::ClassBytes {
880+
) -> Result<hir::ClassBytes> {
881881
use crate::ast::ClassPerlKind::*;
882882

883883
assert!(!self.flags().unicode());
@@ -891,7 +891,13 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
891891
if ast_class.negated {
892892
class.negate();
893893
}
894-
class
894+
// Negating a Perl byte class is likely to cause it to match invalid
895+
// UTF-8. That's only OK if the translator is configured to allow such
896+
// things.
897+
if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() {
898+
return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8));
899+
}
900+
Ok(class)
895901
}
896902

897903
/// Converts the given Unicode specific error to an HIR translation error.
@@ -1969,7 +1975,7 @@ mod tests {
19691975

19701976
#[test]
19711977
#[cfg(feature = "unicode-perl")]
1972-
fn class_perl() {
1978+
fn class_perl_unicode() {
19731979
// Unicode
19741980
assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit")));
19751981
assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space")));
@@ -2009,7 +2015,10 @@ mod tests {
20092015
);
20102016
#[cfg(feature = "unicode-case")]
20112017
assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
2018+
}
20122019

2020+
#[test]
2021+
fn class_perl_ascii() {
20132022
// ASCII only
20142023
assert_eq!(
20152024
t(r"(?-u)\d"),
@@ -2038,29 +2047,93 @@ mod tests {
20382047

20392048
// ASCII only, negated
20402049
assert_eq!(
2041-
t(r"(?-u)\D"),
2050+
t_bytes(r"(?-u)\D"),
20422051
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
20432052
);
20442053
assert_eq!(
2045-
t(r"(?-u)\S"),
2054+
t_bytes(r"(?-u)\S"),
20462055
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
20472056
);
20482057
assert_eq!(
2049-
t(r"(?-u)\W"),
2058+
t_bytes(r"(?-u)\W"),
20502059
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
20512060
);
20522061
assert_eq!(
2053-
t(r"(?i-u)\D"),
2062+
t_bytes(r"(?i-u)\D"),
20542063
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
20552064
);
20562065
assert_eq!(
2057-
t(r"(?i-u)\S"),
2066+
t_bytes(r"(?i-u)\S"),
20582067
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
20592068
);
20602069
assert_eq!(
2061-
t(r"(?i-u)\W"),
2070+
t_bytes(r"(?i-u)\W"),
20622071
hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
20632072
);
2073+
2074+
// ASCII only, negated, with UTF-8 mode enabled.
2075+
// In this case, negating any Perl class results in an error because
2076+
// all such classes can match invalid UTF-8.
2077+
assert_eq!(
2078+
t_err(r"(?-u)\D"),
2079+
TestError {
2080+
kind: hir::ErrorKind::InvalidUtf8,
2081+
span: Span::new(
2082+
Position::new(5, 1, 6),
2083+
Position::new(7, 1, 8),
2084+
),
2085+
},
2086+
);
2087+
assert_eq!(
2088+
t_err(r"(?-u)\S"),
2089+
TestError {
2090+
kind: hir::ErrorKind::InvalidUtf8,
2091+
span: Span::new(
2092+
Position::new(5, 1, 6),
2093+
Position::new(7, 1, 8),
2094+
),
2095+
},
2096+
);
2097+
assert_eq!(
2098+
t_err(r"(?-u)\W"),
2099+
TestError {
2100+
kind: hir::ErrorKind::InvalidUtf8,
2101+
span: Span::new(
2102+
Position::new(5, 1, 6),
2103+
Position::new(7, 1, 8),
2104+
),
2105+
},
2106+
);
2107+
assert_eq!(
2108+
t_err(r"(?i-u)\D"),
2109+
TestError {
2110+
kind: hir::ErrorKind::InvalidUtf8,
2111+
span: Span::new(
2112+
Position::new(6, 1, 7),
2113+
Position::new(8, 1, 9),
2114+
),
2115+
},
2116+
);
2117+
assert_eq!(
2118+
t_err(r"(?i-u)\S"),
2119+
TestError {
2120+
kind: hir::ErrorKind::InvalidUtf8,
2121+
span: Span::new(
2122+
Position::new(6, 1, 7),
2123+
Position::new(8, 1, 9),
2124+
),
2125+
},
2126+
);
2127+
assert_eq!(
2128+
t_err(r"(?i-u)\W"),
2129+
TestError {
2130+
kind: hir::ErrorKind::InvalidUtf8,
2131+
span: Span::new(
2132+
Position::new(6, 1, 7),
2133+
Position::new(8, 1, 9),
2134+
),
2135+
},
2136+
);
20642137
}
20652138

20662139
#[test]

tests/replace.rs

+8-29
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,11 @@ macro_rules! replace(
1212
replace!(first, replace, r"[0-9]", "age: 26", t!("Z"), "age: Z6");
1313
replace!(plus, replace, r"[0-9]+", "age: 26", t!("Z"), "age: Z");
1414
replace!(all, replace_all, r"[0-9]", "age: 26", t!("Z"), "age: ZZ");
15-
replace!(
16-
groups,
17-
replace,
18-
r"(?-u)(\S+)\s+(\S+)",
19-
"w1 w2",
20-
t!("$2 $1"),
21-
"w2 w1"
22-
);
15+
replace!(groups, replace, r"(\S+)\s+(\S+)", "w1 w2", t!("$2 $1"), "w2 w1");
2316
replace!(
2417
double_dollar,
2518
replace,
26-
r"(?-u)(\S+)\s+(\S+)",
19+
r"(\S+)\s+(\S+)",
2720
"w1 w2",
2821
t!("$2 $$1"),
2922
"w2 $1"
@@ -33,7 +26,7 @@ replace!(
3326
replace!(
3427
named,
3528
replace_all,
36-
r"(?-u)(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)",
29+
r"(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)",
3730
"w1 w2 w3 w4",
3831
t!("$last $first$space"),
3932
"w2 w1 w4 w3"
@@ -48,42 +41,28 @@ replace!(
4841
);
4942
replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b");
5043
// replace!(number_underscore, replace, r"(.)(.)", "ab", t!("$1_$2"), "a_b");
51-
replace!(
52-
simple_expand,
53-
replace_all,
54-
r"(?-u)(\w) (\w)",
55-
"a b",
56-
t!("$2 $1"),
57-
"b a"
58-
);
59-
replace!(
60-
literal_dollar1,
61-
replace_all,
62-
r"(?-u)(\w+) (\w+)",
63-
"a b",
64-
t!("$$1"),
65-
"$1"
66-
);
44+
replace!(simple_expand, replace_all, r"(\w) (\w)", "a b", t!("$2 $1"), "b a");
45+
replace!(literal_dollar1, replace_all, r"(\w+) (\w+)", "a b", t!("$$1"), "$1");
6746
replace!(
6847
literal_dollar2,
6948
replace_all,
70-
r"(?-u)(\w+) (\w+)",
49+
r"(\w+) (\w+)",
7150
"a b",
7251
t!("$2 $$c $1"),
7352
"b $c a"
7453
);
7554
replace!(
7655
no_expand1,
7756
replace,
78-
r"(?-u)(\S+)\s+(\S+)",
57+
r"(\S+)\s+(\S+)",
7958
"w1 w2",
8059
no_expand!("$2 $1"),
8160
"$2 $1"
8261
);
8362
replace!(
8463
no_expand2,
8564
replace,
86-
r"(?-u)(\S+)\s+(\S+)",
65+
r"(\S+)\s+(\S+)",
8766
"w1 w2",
8867
no_expand!("$$1"),
8968
"$$1"

0 commit comments

Comments
 (0)