syntax: reject '(?-u)\W' when UTF-8 mode is enabled

BurntSushi · BurntSushi · commit d258c296ade7 · 2023-04-17T15:42:25.000-04:00
When Unicode mode is disabled (i.e., (?-u)), the Perl character classes (\w, \d and \s) revert to their ASCII definitions. The negated forms of these classes are also derived from their ASCII definitions, and this means that they may actually match bytes outside of ASCII and thus possibly invalid UTF-8. For this reason, when the translator is configured to only produce HIR that matches valid UTF-8, '(?-u)\W' should be rejected. Previously, it was not being rejected, which could actually lead to matches that produced offsets that split codepoints, and thus lead to panics when match offsets are used to slice a string. For example, this code fn main() { let re = regex::Regex::new(r"(?-u)\W").unwrap(); let haystack = "☃"; if let Some(m) = re.find(haystack) { println!("{:?}", &haystack[m.range()]); } } panics with byte index 1 is not a char boundary; it is inside '☃' (bytes 0..3) of `☃` That is, it reports a match at 0..1, which is technically correct, but the regex itself should have been rejected in the first place since the top-level Regex API always has UTF-8 mode enabled. Also, many of the replacement tests were using '(?-u)\W' (or similar) for some reason. I'm not sure why, so I just removed the '(?-u)' to make those tests pass. Whether Unicode is enabled or not doesn't seem to be an interesting detail for those tests. (All haystacks and replacements appear to be ASCII.) Fixes #895, Partially addresses #738
diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
@@ -305,7 +305,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
                     let hcls = hir::Class::Unicode(cls);
                     self.push(HirFrame::Expr(Hir::class(hcls)));
                 } else {
-                    let cls = self.hir_perl_byte_class(x);
+                    let cls = self.hir_perl_byte_class(x)?;
                     let hcls = hir::Class::Bytes(cls);
                     self.push(HirFrame::Expr(Hir::class(hcls)));
                 }
@@ -445,7 +445,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
                     cls.union(&xcls);
                     self.push(HirFrame::ClassUnicode(cls));
                 } else {
-                    let xcls = self.hir_perl_byte_class(x);
+                    let xcls = self.hir_perl_byte_class(x)?;
                     let mut cls = self.pop().unwrap().unwrap_class_bytes();
                     cls.union(&xcls);
                     self.push(HirFrame::ClassBytes(cls));
@@ -879,7 +879,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
     fn hir_perl_byte_class(
         &self,
         ast_class: &ast::ClassPerl,
-    ) -> hir::ClassBytes {
+    ) -> Result<hir::ClassBytes> {
         use crate::ast::ClassPerlKind::*;
 
         assert!(!self.flags().unicode());
@@ -893,7 +893,13 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         if ast_class.negated {
             class.negate();
         }
-        class
+        // Negating a Perl byte class is likely to cause it to match invalid
+        // UTF-8. That's only OK if the translator is configured to allow such
+        // things.
+        if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() {
+            return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8));
+        }
+        Ok(class)
     }
 
     /// Converts the given Unicode specific error to an HIR translation error.
@@ -1971,7 +1977,7 @@ mod tests {
 
     #[test]
     #[cfg(feature = "unicode-perl")]
-    fn class_perl() {
+    fn class_perl_unicode() {
         // Unicode
         assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit")));
         assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space")));
@@ -2011,7 +2017,10 @@ mod tests {
         );
         #[cfg(feature = "unicode-case")]
         assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
+    }
 
+    #[test]
+    fn class_perl_ascii() {
         // ASCII only
         assert_eq!(
             t(r"(?-u)\d"),
@@ -2040,29 +2049,93 @@ mod tests {
 
         // ASCII only, negated
         assert_eq!(
-            t(r"(?-u)\D"),
+            t_bytes(r"(?-u)\D"),
             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
         );
         assert_eq!(
-            t(r"(?-u)\S"),
+            t_bytes(r"(?-u)\S"),
             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
         );
         assert_eq!(
-            t(r"(?-u)\W"),
+            t_bytes(r"(?-u)\W"),
             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
         );
         assert_eq!(
-            t(r"(?i-u)\D"),
+            t_bytes(r"(?i-u)\D"),
             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
         );
         assert_eq!(
-            t(r"(?i-u)\S"),
+            t_bytes(r"(?i-u)\S"),
             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
         );
         assert_eq!(
-            t(r"(?i-u)\W"),
+            t_bytes(r"(?i-u)\W"),
             hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
         );
+
+        // ASCII only, negated, with UTF-8 mode enabled.
+        // In this case, negating any Perl class results in an error because
+        // all such classes can match invalid UTF-8.
+        assert_eq!(
+            t_err(r"(?-u)\D"),
+            TestError {
+                kind: hir::ErrorKind::InvalidUtf8,
+                span: Span::new(
+                    Position::new(5, 1, 6),
+                    Position::new(7, 1, 8),
+                ),
+            },
+        );
+        assert_eq!(
+            t_err(r"(?-u)\S"),
+            TestError {
+                kind: hir::ErrorKind::InvalidUtf8,
+                span: Span::new(
+                    Position::new(5, 1, 6),
+                    Position::new(7, 1, 8),
+                ),
+            },
+        );
+        assert_eq!(
+            t_err(r"(?-u)\W"),
+            TestError {
+                kind: hir::ErrorKind::InvalidUtf8,
+                span: Span::new(
+                    Position::new(5, 1, 6),
+                    Position::new(7, 1, 8),
+                ),
+            },
+        );
+        assert_eq!(
+            t_err(r"(?i-u)\D"),
+            TestError {
+                kind: hir::ErrorKind::InvalidUtf8,
+                span: Span::new(
+                    Position::new(6, 1, 7),
+                    Position::new(8, 1, 9),
+                ),
+            },
+        );
+        assert_eq!(
+            t_err(r"(?i-u)\S"),
+            TestError {
+                kind: hir::ErrorKind::InvalidUtf8,
+                span: Span::new(
+                    Position::new(6, 1, 7),
+                    Position::new(8, 1, 9),
+                ),
+            },
+        );
+        assert_eq!(
+            t_err(r"(?i-u)\W"),
+            TestError {
+                kind: hir::ErrorKind::InvalidUtf8,
+                span: Span::new(
+                    Position::new(6, 1, 7),
+                    Position::new(8, 1, 9),
+                ),
+            },
+        );
     }
 
     #[test]
diff --git a/tests/replace.rs b/tests/replace.rs
@@ -15,15 +15,15 @@ replace!(all, replace_all, r"[0-9]", "age: 26", t!("Z"), "age: ZZ");
 replace!(
     groups,
     replace,
-    r"(?-u)(\S+)\s+(\S+)",
+    r"([^ ]+)[ ]+([^ ]+)",
     "w1 w2",
     t!("$2 $1"),
     "w2 w1"
 );
 replace!(
     double_dollar,
     replace,
-    r"(?-u)(\S+)\s+(\S+)",
+    r"([^ ]+)[ ]+([^ ]+)",
     "w1 w2",
     t!("$2 $$1"),
     "w2 $1"
@@ -33,7 +33,7 @@ replace!(
 replace!(
     named,
     replace_all,
-    r"(?-u)(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)",
+    r"(?P<first>[^ ]+)[ ]+(?P<last>[^ ]+)(?P<space>[ ]*)",
     "w1 w2 w3 w4",
     t!("$last $first$space"),
     "w2 w1 w4 w3"
@@ -51,39 +51,39 @@ replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b");
 replace!(
     simple_expand,
     replace_all,
-    r"(?-u)(\w) (\w)",
+    r"([a-z]) ([a-z])",
     "a b",
     t!("$2 $1"),
     "b a"
 );
 replace!(
     literal_dollar1,
     replace_all,
-    r"(?-u)(\w+) (\w+)",
+    r"([a-z]+) ([a-z]+)",
     "a b",
     t!("$$1"),
     "$1"
 );
 replace!(
     literal_dollar2,
     replace_all,
-    r"(?-u)(\w+) (\w+)",
+    r"([a-z]+) ([a-z]+)",
     "a b",
     t!("$2 $$c $1"),
     "b $c a"
 );
 replace!(
     no_expand1,
     replace,
-    r"(?-u)(\S+)\s+(\S+)",
+    r"([^ ]+)[ ]+([^ ]+)",
     "w1 w2",
     no_expand!("$2 $1"),
     "$2 $1"
 );
 replace!(
     no_expand2,
     replace,
-    r"(?-u)(\S+)\s+(\S+)",
+    r"([^ ]+)[ ]+([^ ]+)",
     "w1 w2",
     no_expand!("$$1"),
     "$$1"