From fe4667770bcc9c10f88cf47f0a532527657baabd Mon Sep 17 00:00:00 2001
From: raskad <32105367+raskad@users.noreply.github.com>
Date: Tue, 26 Dec 2023 04:24:27 +0100
Subject: [PATCH] Allow non UTF-8 regex parsing

---
 core/parser/src/lexer/regex.rs | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)
diff --git a/core/parser/src/lexer/regex.rs b/core/parser/src/lexer/regex.rs
index 97bbbd1e028..2a99e013f3b 100644
--- a/core/parser/src/lexer/regex.rs
+++ b/core/parser/src/lexer/regex.rs
@@ -108,19 +108,25 @@ impl<R> Tokenizer<R> for RegexLiteral {
         // SAFETY: We have already checked that the bytes are valid UTF-8.
         let flags_str = unsafe { str::from_utf8_unchecked(flags.as_slice()) };
 
-        let mut body_str = String::with_capacity(body.len());
-        for c in body {
-            if let Some(ch) = char::from_u32(c) {
-                body_str.push(ch);
+        let mut body_utf16 = Vec::new();
+
+        // Encode the body as UTF-16; it may contain lone surrogates, which UTF-8 cannot represent.
+        // Code points <= 0xFFFF fit in one `u16`; larger ones become a surrogate pair. Casting is fine.
+        #[allow(clippy::cast_possible_truncation)]
+        for cp in &body {
+            let cp = *cp;
+            if cp <= 0xFFFF {
+                body_utf16.push(cp as u16);
             } else {
-                return Err(Error::Syntax(
-                    "Invalid UTF-8 character in regular expressions".into(),
-                    start_pos,
-                ));
+                let cp = cp - 0x1_0000;
+                let high = 0xD800 | ((cp >> 10) as u16);
+                let low = 0xDC00 | ((cp as u16) & 0x3FF);
+                body_utf16.push(high);
+                body_utf16.push(low);
             }
         }
 
-        if let Err(error) = Regex::with_flags(&body_str, flags_str) {
+        if let Err(error) = Regex::from_unicode(body.into_iter(), flags_str) {
             return Err(Error::Syntax(
                 format!("Invalid regular expression literal: {error}").into(),
                 start_pos,
@@ -129,7 +135,7 @@ impl<R> Tokenizer<R> for RegexLiteral {
 
         Ok(Token::new(
             TokenKind::regular_expression_literal(
-                interner.get_or_intern(body_str.as_str()),
+                interner.get_or_intern(body_utf16.as_slice()),
                 parse_regex_flags(flags_str, flags_start, interner)?,
             ),
             Span::new(start_pos, cursor.pos()),