From fe4667770bcc9c10f88cf47f0a532527657baabd Mon Sep 17 00:00:00 2001 From: raskad <32105367+raskad@users.noreply.github.com> Date: Tue, 26 Dec 2023 04:24:27 +0100 Subject: [PATCH] Allow non UTF-8 regex parsing --- core/parser/src/lexer/regex.rs | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/core/parser/src/lexer/regex.rs b/core/parser/src/lexer/regex.rs index 97bbbd1e028..2a99e013f3b 100644 --- a/core/parser/src/lexer/regex.rs +++ b/core/parser/src/lexer/regex.rs @@ -108,19 +108,25 @@ impl Tokenizer for RegexLiteral { // SAFETY: We have already checked that the bytes are valid UTF-8. let flags_str = unsafe { str::from_utf8_unchecked(flags.as_slice()) }; - let mut body_str = String::with_capacity(body.len()); - for c in body { - if let Some(ch) = char::from_u32(c) { - body_str.push(ch); + let mut body_utf16 = Vec::new(); + + // We convert the body to UTF-16 since it may contain code points (e.g. lone surrogates) that cannot be encoded as UTF-8. + // We already know that the body is valid UTF-16. Casting is fine. + #[allow(clippy::cast_possible_truncation)] + for cp in &body { + let cp = *cp; + if cp <= 0xFFFF { + body_utf16.push(cp as u16); } else { - return Err(Error::Syntax( - "Invalid UTF-8 character in regular expressions".into(), - start_pos, - )); + let cp = cp - 0x1_0000; + let high = 0xD800 | ((cp >> 10) as u16); + let low = 0xDC00 | ((cp as u16) & 0x3FF); + body_utf16.push(high); + body_utf16.push(low); } } - if let Err(error) = Regex::with_flags(&body_str, flags_str) { + if let Err(error) = Regex::from_unicode(body.into_iter(), flags_str) { return Err(Error::Syntax( format!("Invalid regular expression literal: {error}").into(), start_pos, @@ -129,7 +135,7 @@ impl Tokenizer for RegexLiteral { Ok(Token::new( TokenKind::regular_expression_literal( - interner.get_or_intern(body_str.as_str()), + interner.get_or_intern(body_utf16.as_slice()), parse_regex_flags(flags_str, flags_start, interner)?, ), Span::new(start_pos, cursor.pos()),