From 43f7cb31c48b22c063fdf0c74859ec12e5a1e8ed Mon Sep 17 00:00:00 2001 From: Marijn Schouten Date: Sun, 2 Feb 2025 19:19:09 +0100 Subject: [PATCH] Restructure rustc_lexer::unescape Separate the functions for unescaping each kind of string and unit: - this duplicates some code, but also gets rid of code that is only there for genericity - each function is now simpler by inlining booleans, which might lead to faster code Use a `Peekable<CharIndices<'_>>` instead of going back and forth between string slice and chars iterator. - this gets rid of most position computations - allows removal of double traversal for correct backslash newline escapes in skip_ascii_whitespace Improves documentation --- compiler/rustc_ast/src/util/literal.rs | 11 +- compiler/rustc_lexer/src/unescape.rs | 557 ++++++++++-------- compiler/rustc_lexer/src/unescape/tests.rs | 6 +- compiler/rustc_parse/src/lexer/mod.rs | 6 +- .../crates/parser/src/lexed_str.rs | 42 +- .../crates/syntax/src/ast/token_ext.rs | 22 +- .../crates/syntax/src/validation.rs | 16 +- 7 files changed, 354 insertions(+), 306 deletions(-) diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index 4459cb962e8e9..4e1c6ed336d55 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -3,7 +3,7 @@ use std::{ascii, fmt, str}; use rustc_lexer::unescape::{ - MixedUnit, Mode, byte_from_char, unescape_byte, unescape_char, unescape_mixed, unescape_unicode, + MixedUnit, unescape_byte, unescape_byte_str, unescape_char, unescape_cstr, unescape_str, }; use rustc_span::{Span, Symbol, kw, sym}; use tracing::debug; @@ -87,9 +87,8 @@ impl LitKind { // Force-inlining here is aggressive but the closure is // called on every char in the string, so it can be hot in // programs with many long strings containing escapes. 
- unescape_unicode( + unescape_str( s, - Mode::Str, &mut #[inline(always)] |_, c| match c { Ok(c) => buf.push(c), @@ -111,8 +110,8 @@ impl LitKind { token::ByteStr => { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); - unescape_unicode(s, Mode::ByteStr, &mut |_, c| match c { - Ok(c) => buf.push(byte_from_char(c)), + unescape_byte_str(s, &mut |_, c| match c { + Ok(c) => buf.push(c), Err(err) => { assert!(!err.is_fatal(), "failed to unescape string literal") } @@ -128,7 +127,7 @@ impl LitKind { token::CStr => { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); - unescape_mixed(s, Mode::CStr, &mut |_span, c| match c { + unescape_cstr(s, &mut |_span, c| match c { Ok(MixedUnit::Char(c)) => { buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) } diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index d6ea4249247f3..ef1c7e39d391b 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -1,8 +1,9 @@ //! Utilities for validating string and char literals and turning them into //! values they represent. +use std::iter::{Peekable, from_fn}; use std::ops::Range; -use std::str::Chars; +use std::str::{CharIndices, Chars}; use Mode::*; @@ -80,33 +81,6 @@ impl EscapeError { } } -/// Takes the contents of a unicode-only (non-mixed-utf8) literal (without -/// quotes) and produces a sequence of escaped characters or errors. -/// -/// Values are returned by invoking `callback`. For `Char` and `Byte` modes, -/// the callback will be called exactly once. 
-pub fn unescape_unicode(src: &str, mode: Mode, callback: &mut F) -where - F: FnMut(Range, Result), -{ - match mode { - Char | Byte => { - let mut chars = src.chars(); - let res = unescape_char_or_byte(&mut chars, mode); - callback(0..(src.len() - chars.as_str().len()), res); - } - Str | ByteStr => unescape_non_raw_common(src, mode, callback), - RawStr | RawByteStr => check_raw_common(src, mode, callback), - RawCStr => check_raw_common(src, mode, &mut |r, mut result| { - if let Ok('\0') = result { - result = Err(EscapeError::NulInCStr); - } - callback(r, result) - }), - CStr => unreachable!(), - } -} - /// Used for mixed utf8 string literals, i.e. those that allow both unicode /// chars and high bytes. pub enum MixedUnit { @@ -138,139 +112,285 @@ impl From for MixedUnit { } } -/// Takes the contents of a mixed-utf8 literal (without quotes) and produces -/// a sequence of escaped characters or errors. -/// -/// Values are returned by invoking `callback`. -pub fn unescape_mixed(src: &str, mode: Mode, callback: &mut F) +/// Takes the contents of a raw string literal (without quotes) and produces a +/// sequence of characters or errors, which are returned by invoking `callback`. +/// NOTE: Raw strings don't do any unescaping, but do produce errors on bare CR. +fn check_raw_str(src: &str, callback: &mut F) where - F: FnMut(Range, Result), + F: FnMut(Range, Result), { - match mode { - CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| { - if let Ok(MixedUnit::Char('\0')) = result { - result = Err(EscapeError::NulInCStr); - } - callback(r, result) - }), - Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(), - } + src.char_indices().for_each(|(pos, c)| { + callback( + pos..pos + c.len_utf8(), + if c == '\r' { Err(EscapeError::BareCarriageReturnInRawString) } else { Ok(c) }, + ); + }); } -/// Takes a contents of a char literal (without quotes), and returns an -/// unescaped char or an error. 
-pub fn unescape_char(src: &str) -> Result { - unescape_char_or_byte(&mut src.chars(), Char) +/// Takes the contents of a raw byte string literal (without quotes) and produces a +/// sequence of characters or errors, which are returned by invoking `callback`. +/// NOTE: Raw strings don't do any unescaping, but do produce errors on bare CR. +fn check_raw_byte_str(src: &str, callback: &mut F) +where + F: FnMut(Range, Result), +{ + src.char_indices().for_each(|(pos, c)| { + callback( + pos..pos + c.len_utf8(), + if c == '\r' { + Err(EscapeError::BareCarriageReturnInRawString) + } else { + c.try_into().map_err(|_| EscapeError::NonAsciiCharInByte) + }, + ); + }); } -/// Takes a contents of a byte literal (without quotes), and returns an -/// unescaped byte or an error. -pub fn unescape_byte(src: &str) -> Result { - unescape_char_or_byte(&mut src.chars(), Byte).map(byte_from_char) +/// Takes the contents of a raw C string literal (without quotes) and produces a +/// sequence of characters or errors, which are returned by invoking `callback`. +/// NOTE: Raw strings don't do any unescaping, but do produce errors on bare CR. +fn check_raw_cstr(src: &str, callback: &mut F) +where + F: FnMut(Range, Result), +{ + src.char_indices().for_each(|(pos, c)| { + callback(pos..pos + c.len_utf8(), match c { + '\r' => Err(EscapeError::BareCarriageReturnInRawString), + '\0' => Err(EscapeError::NulInCStr), + _ => Ok(c), + }); + }); } -/// What kind of literal do we parse. -#[derive(Debug, Clone, Copy, PartialEq)] -pub enum Mode { - Char, - - Byte, - - Str, - RawStr, - - ByteStr, - RawByteStr, +/// Take the contents of a string literal (without quotes) +/// and produce a sequence of escaped characters or errors, +/// which are returned by invoking `callback`. 
+pub fn unescape_str(src: &str, callback: &mut F) +where + F: FnMut(Range, Result), +{ + let mut chars = src.char_indices().peekable(); + while let Some((start, c)) = chars.next() { + let res = match c { + // skip whitespace for backslash newline, see [Rust language reference] + // (https://doc.rust-lang.org/reference/tokens.html#string-literals). + '\\' if chars.next_if(|&(_, c)| c == '\n').is_some() => { + let mut callback_err = |range, err| callback(range, Err(err)); + skip_ascii_whitespace(&mut chars, start, &mut callback_err); + continue; + } + '\\' => scan_escape_for_char(&mut from_fn(|| chars.next().map(|i| i.1))), + '"' => Err(EscapeError::EscapeOnlyChar), + '\r' => Err(EscapeError::BareCarriageReturn), + c => Ok(c), + }; + let end = chars.peek().map(|&(end, _)| end).unwrap_or(src.len()); + callback(start..end, res); + } +} - CStr, - RawCStr, +/// Take the contents of a byte string literal (without quotes) +/// and produce a sequence of unescaped bytes or errors, +/// which are returned by invoking `callback`. +pub fn unescape_byte_str(src: &str, callback: &mut F) +where + F: FnMut(Range, Result), +{ + let mut chars = src.char_indices().peekable(); + while let Some((start, c)) = chars.next() { + let res = match c { + // skip whitespace for backslash newline, see [Rust language reference] + // (https://doc.rust-lang.org/reference/tokens.html#string-literals). 
+ '\\' if chars.next_if(|&(_, c)| c == '\n').is_some() => { + let mut callback_err = |range, err| callback(range, Err(err)); + skip_ascii_whitespace(&mut chars, start, &mut callback_err); + continue; + } + '\\' => scan_escape_for_byte(&mut from_fn(|| chars.next().map(|i| i.1))), + '"' => Err(EscapeError::EscapeOnlyChar), + '\r' => Err(EscapeError::BareCarriageReturn), + c => c.try_into().map_err(|_| EscapeError::NonAsciiCharInByte), + }; + let end = chars.peek().map(|&(end, _)| end).unwrap_or(src.len()); + callback(start..end, res); + } } -impl Mode { - pub fn in_double_quotes(self) -> bool { - match self { - Str | RawStr | ByteStr | RawByteStr | CStr | RawCStr => true, - Char | Byte => false, - } +/// Take the contents of a C string literal (without quotes) +/// and produce a sequence of unescaped characters or errors, +/// which are returned by invoking `callback`. +pub fn unescape_cstr(src: &str, callback: &mut F) +where + F: FnMut(Range, Result), +{ + let mut chars = src.char_indices().peekable(); + while let Some((start, c)) = chars.next() { + let res = match c { + // skip whitespace for backslash newline, see [Rust language reference] + // (https://doc.rust-lang.org/reference/tokens.html#string-literals). + '\\' if chars.next_if(|&(_, c)| c == '\n').is_some() => { + let mut callback_err = |range, err| callback(range, Err(err)); + skip_ascii_whitespace(&mut chars, start, &mut callback_err); + continue; + } + '\\' => scan_escape_for_cstr(&mut from_fn(|| chars.next().map(|i| i.1))), + '"' => Err(EscapeError::EscapeOnlyChar), + '\r' => Err(EscapeError::BareCarriageReturn), + '\0' => Err(EscapeError::NulInCStr), + c => Ok(c.into()), + }; + let end = chars.peek().map(|&(end, _)| end).unwrap_or(src.len()); + callback(start..end, res); } +} - /// Are `\x80`..`\xff` allowed? - fn allow_high_bytes(self) -> bool { - match self { - Char | Str => false, - Byte | ByteStr | CStr => true, - RawStr | RawByteStr | RawCStr => unreachable!(), - } +/// Skip ASCII whitespace. 
+/// Warns on unescaped newline and following non-ASCII whitespace. +fn skip_ascii_whitespace(chars: &mut Peekable>, start: usize, callback: &mut F) +where + F: FnMut(Range, EscapeError), +{ + // the escaping slash and newline characters add 2 bytes + let mut end = start + 2; + let mut contains_nl = false; + while let Some((_, c)) = chars.next_if(|(_, c)| c.is_ascii_whitespace()) { + end += 1; + contains_nl = contains_nl || c == '\n'; } - /// Are unicode (non-ASCII) chars allowed? - #[inline] - fn allow_unicode_chars(self) -> bool { - match self { - Byte | ByteStr | RawByteStr => false, - Char | Str | RawStr | CStr | RawCStr => true, + if contains_nl { + callback(start..end, EscapeError::MultipleSkippedLinesWarning); + } + if let Some((_, c)) = chars.peek() { + if c.is_whitespace() { + // for error reporting, include the character that was not skipped in the span + callback(start..end + c.len_utf8(), EscapeError::UnskippedWhitespaceWarning); } } +} - /// Are unicode escapes (`\u`) allowed? - fn allow_unicode_escapes(self) -> bool { - match self { - Byte | ByteStr => false, - Char | Str | CStr => true, - RawByteStr | RawStr | RawCStr => unreachable!(), - } +/// Takes the contents of a char literal (without quotes), +/// and returns an unescaped char or an error. +pub fn unescape_char(src: &str) -> Result { + unescape_char_iter(&mut src.chars()) +} + +fn unescape_char_iter(chars: &mut Chars<'_>) -> Result { + let res = match chars.next().ok_or(EscapeError::ZeroChars)? 
{ + '\\' => scan_escape_for_char(chars), + '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), + '\r' => Err(EscapeError::BareCarriageReturn), + c => Ok(c), + }?; + if chars.next().is_some() { + return Err(EscapeError::MoreThanOneChar); } + Ok(res) +} - pub fn prefix_noraw(self) -> &'static str { - match self { - Char | Str | RawStr => "", - Byte | ByteStr | RawByteStr => "b", - CStr | RawCStr => "c", - } +/// Takes the contents of a byte literal (without quotes), +/// and returns an unescaped byte or an error. +pub fn unescape_byte(src: &str) -> Result { + unescape_byte_iter(&mut src.chars()) +} + +fn unescape_byte_iter(chars: &mut Chars<'_>) -> Result { + let res = match chars.next().ok_or(EscapeError::ZeroChars)? { + '\\' => scan_escape_for_byte(chars), + '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), + '\r' => Err(EscapeError::BareCarriageReturn), + c => c.try_into().map_err(|_| EscapeError::NonAsciiCharInByte), + }?; + if chars.next().is_some() { + return Err(EscapeError::MoreThanOneChar); } + Ok(res) } -fn scan_escape + From>( - chars: &mut Chars<'_>, - mode: Mode, -) -> Result { +/// Scan an escape sequence for a char +fn scan_escape_for_char(chars: &mut impl Iterator) -> Result { // Previous character was '\\', unescape what follows. - let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? { - '"' => '"', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - '\\' => '\\', - '\'' => '\'', - '0' => '\0', + let next = chars.next().ok_or(EscapeError::LoneSlash)?; + simple_escape(next).map(|b| b as char).or_else(|c| match c { 'x' => { - // Parse hexadecimal character code. 
- - let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; - let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; - - let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; - let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + let byte = hex_escape(chars)?; + if byte.is_ascii() { Ok(byte as char) } else { Err(EscapeError::OutOfRangeHexEscape) } + } + 'u' => { + let value = unicode_escape(chars)?; + if value > char::MAX as u32 { + Err(EscapeError::OutOfRangeUnicodeEscape) + } else { + char::from_u32(value).ok_or(EscapeError::LoneSurrogateUnicodeEscape) + } + } + _ => Err(EscapeError::InvalidEscape), + }) +} - let value = (hi * 16 + lo) as u8; +/// Scan an escape sequence for a byte +fn scan_escape_for_byte(chars: &mut impl Iterator) -> Result { + // Previous character was '\\', unescape what follows. + let next = chars.next().ok_or(EscapeError::LoneSlash)?; + simple_escape(next).or_else(|c| match c { + 'x' => hex_escape(chars), + 'u' => { + let _ = unicode_escape(chars)?; + Err(EscapeError::UnicodeEscapeInByte) + } + _ => Err(EscapeError::InvalidEscape), + }) +} - return if !mode.allow_high_bytes() && !value.is_ascii() { - Err(EscapeError::OutOfRangeHexEscape) +fn scan_escape_for_cstr(chars: &mut impl Iterator) -> Result { + // Previous character was '\\', unescape what follows. + let next = chars.next().ok_or(EscapeError::LoneSlash)?; + simple_escape(next).map(MixedUnit::from).or_else(|c| match c { + 'x' => Ok(MixedUnit::from(hex_escape(chars)?)), + 'u' => { + let value = unicode_escape(chars)?; + if value > char::MAX as u32 { + Err(EscapeError::OutOfRangeUnicodeEscape) } else { - // This may be a high byte, but that will only happen if `T` is - // `MixedUnit`, because of the `allow_high_bytes` check above. 
- Ok(T::from(value)) - }; + char::from_u32(value) + .map(MixedUnit::Char) + .ok_or(EscapeError::LoneSurrogateUnicodeEscape) + } } - 'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from), - _ => return Err(EscapeError::InvalidEscape), - }; - Ok(T::from(res)) + + _ => Err(EscapeError::InvalidEscape), + }) +} + +/// Parse the character of an ASCII escape without the leading backslash. +fn simple_escape(c: char) -> Result { + // Previous character was '\\', unescape what follows. + Ok(match c { + '"' => b'"', + 'n' => b'\n', + 'r' => b'\r', + 't' => b'\t', + '\\' => b'\\', + '\'' => b'\'', + '0' => b'\0', + _ => Err(c)?, + }) } -fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result { - // We've parsed '\u', now we have to parse '{..}'. +/// Parse the two hexadecimal characters of a hexadecimal escape without the leading r"\x". +fn hex_escape(chars: &mut impl Iterator) -> Result { + let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; + let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + + let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; + let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + Ok((hi * 16 + lo) as u8) +} + +/// Parse the braces with hexadecimal characters (and underscores) part of a unicode escape. +/// This r"{...}" normally comes after r"\u" and cannot start with an underscore. +fn unicode_escape(chars: &mut impl Iterator) -> Result { if chars.next() != Some('{') { return Err(EscapeError::NoBraceInUnicodeEscape); } @@ -290,23 +410,13 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result return Err(EscapeError::UnclosedUnicodeEscape), Some('_') => continue, Some('}') => { - if n_digits > 6 { - return Err(EscapeError::OverlongUnicodeEscape); - } - // Incorrect syntax has higher priority for error reporting // than unallowed value for a literal. 
- if !allow_unicode_escapes { - return Err(EscapeError::UnicodeEscapeInByte); - } - - break std::char::from_u32(value).ok_or({ - if value > 0x10FFFF { - EscapeError::OutOfRangeUnicodeEscape - } else { - EscapeError::LoneSurrogateUnicodeEscape - } - }); + return if n_digits > 6 { + Err(EscapeError::OverlongUnicodeEscape) + } else { + Ok(value) + }; } Some(c) => { let digit: u32 = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?; @@ -321,118 +431,67 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result Result { - if allow_unicode_chars || c.is_ascii() { Ok(c) } else { Err(EscapeError::NonAsciiCharInByte) } -} - -fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result { - let c = chars.next().ok_or(EscapeError::ZeroChars)?; - let res = match c { - '\\' => scan_escape(chars, mode), - '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), - '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, mode.allow_unicode_chars()), - }?; - if chars.next().is_some() { - return Err(EscapeError::MoreThanOneChar); - } - Ok(res) -} - -/// Takes a contents of a string literal (without quotes) and produces a -/// sequence of escaped characters or errors. -fn unescape_non_raw_common + From>(src: &str, mode: Mode, callback: &mut F) +/// Takes the contents of a unicode-only (non-mixed-utf8) literal (without quotes) +/// and produces a sequence of unescaped characters or errors, +/// which are returned by invoking `callback`. +/// +/// For `Char` and `Byte` modes, the callback will be called exactly once. +pub fn unescape_unicode(src: &str, mode: Mode, callback: &mut F) where - F: FnMut(Range, Result), + F: FnMut(Range, Result), { - let mut chars = src.chars(); - let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop - - // The `start` and `end` computation here is complicated because - // `skip_ascii_whitespace` makes us to skip over chars without counting - // them in the range computation. 
- while let Some(c) = chars.next() { - let start = src.len() - chars.as_str().len() - c.len_utf8(); - let res = match c { - '\\' => { - match chars.clone().next() { - Some('\n') => { - // Rust language specification requires us to skip whitespaces - // if unescaped '\' character is followed by '\n'. - // For details see [Rust language reference] - // (https://doc.rust-lang.org/reference/tokens.html#string-literals). - skip_ascii_whitespace(&mut chars, start, &mut |range, err| { - callback(range, Err(err)) - }); - continue; - } - _ => scan_escape::(&mut chars, mode), - } - } - '"' => Err(EscapeError::EscapeOnlyChar), - '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, allow_unicode_chars).map(T::from), - }; - let end = src.len() - chars.as_str().len(); - callback(start..end, res); + let mut byte_callback = + |range, res: Result| callback(range, res.map(char::from)); + match mode { + Char => { + let mut chars = src.chars(); + let res = unescape_char_iter(&mut chars); + callback(0..(src.len() - chars.as_str().len()), res); + } + Byte => { + let mut chars = src.chars(); + let res = unescape_byte_iter(&mut chars).map(char::from); + callback(0..(src.len() - chars.as_str().len()), res); + } + Str => unescape_str(src, callback), + ByteStr => unescape_byte_str(src, &mut byte_callback), + RawStr => check_raw_str(src, callback), + RawByteStr => check_raw_byte_str(src, &mut byte_callback), + RawCStr => check_raw_cstr(src, callback), + CStr => unreachable!(), } } -fn skip_ascii_whitespace(chars: &mut Chars<'_>, start: usize, callback: &mut F) -where - F: FnMut(Range, EscapeError), -{ - let tail = chars.as_str(); - let first_non_space = tail - .bytes() - .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') - .unwrap_or(tail.len()); - if tail[1..first_non_space].contains('\n') { - // The +1 accounts for the escaping slash. 
- let end = start + first_non_space + 1; - callback(start..end, EscapeError::MultipleSkippedLinesWarning); - } - let tail = &tail[first_non_space..]; - if let Some(c) = tail.chars().next() { - if c.is_whitespace() { - // For error reporting, we would like the span to contain the character that was not - // skipped. The +1 is necessary to account for the leading \ that started the escape. - let end = start + first_non_space + c.len_utf8() + 1; - callback(start..end, EscapeError::UnskippedWhitespaceWarning); - } - } - *chars = tail.chars(); +/// What kind of literal do we parse. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum Mode { + Char, + + Byte, + + Str, + RawStr, + + ByteStr, + RawByteStr, + + CStr, + RawCStr, } -/// Takes a contents of a string literal (without quotes) and produces a -/// sequence of characters or errors. -/// NOTE: Raw strings do not perform any explicit character escaping, here we -/// only produce errors on bare CR. -fn check_raw_common(src: &str, mode: Mode, callback: &mut F) -where - F: FnMut(Range, Result), -{ - let mut chars = src.chars(); - let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop - - // The `start` and `end` computation here matches the one in - // `unescape_non_raw_common` for consistency, even though this function - // doesn't have to worry about skipping any chars. 
- while let Some(c) = chars.next() { - let start = src.len() - chars.as_str().len() - c.len_utf8(); - let res = match c { - '\r' => Err(EscapeError::BareCarriageReturnInRawString), - _ => ascii_check(c, allow_unicode_chars), - }; - let end = src.len() - chars.as_str().len(); - callback(start..end, res); +impl Mode { + pub fn in_double_quotes(self) -> bool { + match self { + Str | RawStr | ByteStr | RawByteStr | CStr | RawCStr => true, + Char | Byte => false, + } } -} -#[inline] -pub fn byte_from_char(c: char) -> u8 { - let res = c as u32; - debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr"); - res as u8 + pub fn prefix_noraw(self) -> &'static str { + match self { + Char | Str | RawStr => "", + Byte | ByteStr | RawByteStr => "b", + CStr | RawCStr => "c", + } + } } diff --git a/compiler/rustc_lexer/src/unescape/tests.rs b/compiler/rustc_lexer/src/unescape/tests.rs index 6fa7a150516b8..715037e606713 100644 --- a/compiler/rustc_lexer/src/unescape/tests.rs +++ b/compiler/rustc_lexer/src/unescape/tests.rs @@ -241,7 +241,11 @@ fn test_unescape_byte_str_good() { unescape_unicode(literal_text, Mode::ByteStr, &mut |range, c| { if let Ok(b) = &mut buf { match c { - Ok(c) => b.push(byte_from_char(c)), + Ok(c) => { + let c = c as u32; + debug_assert!(c <= u8::MAX as u32, "guaranteed because of ByteStr"); + b.push(c as u8) + } Err(e) => buf = Err((range, e)), } } diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 792e2cc26ef1c..9aadcf12c57c4 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -985,10 +985,8 @@ impl<'psess, 'src> Lexer<'psess, 'src> { prefix_len: u32, postfix_len: u32, ) -> (token::LitKind, Symbol) { - self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| { - unescape::unescape_mixed(src, mode, &mut |span, result| { - callback(span, result.map(drop)) - }) + self.cook_common(kind, mode, start, end, prefix_len, 
postfix_len, |src, _mode, callback| { + unescape::unescape_cstr(src, &mut |span, result| callback(span, result.map(drop))) }) } } diff --git a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs index c97596d5097ec..24a64ca31dac4 100644 --- a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs +++ b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs @@ -309,7 +309,11 @@ impl<'a> Converter<'a> { let text = &self.res.text[self.offset + 1..][..len - 1]; let i = text.rfind('"').unwrap(); let text = &text[..i]; - err = unescape_string_error_message(text, Mode::Str); + err = rustc_lexer::unescape::unescape_str(text, &mut |_, res| { + if let Err(e) = res { + error_message = error_to_diagnostic_message(e, Mode::Str); + } + }); } STRING } @@ -320,7 +324,11 @@ impl<'a> Converter<'a> { let text = &self.res.text[self.offset + 2..][..len - 2]; let i = text.rfind('"').unwrap(); let text = &text[..i]; - err = unescape_string_error_message(text, Mode::ByteStr); + err = rustc_lexer::unescape::unescape_byte_str(text, &mut |_, res| { + if let Err(e) = res { + error_message = error_to_diagnostic_message(e, Mode::ByteStr); + } + }); } BYTE_STRING } @@ -331,7 +339,11 @@ impl<'a> Converter<'a> { let text = &self.res.text[self.offset + 2..][..len - 2]; let i = text.rfind('"').unwrap(); let text = &text[..i]; - err = unescape_string_error_message(text, Mode::CStr); + err = rustc_lexer::unescape::unescape_cstr(text, &mut |_, res| { + if let Err(e) = res { + error_message = error_to_diagnostic_message(e, Mode::CStr); + } + }); } C_STRING } @@ -397,27 +409,3 @@ fn error_to_diagnostic_message(error: EscapeError, mode: Mode) -> &'static str { EscapeError::MultipleSkippedLinesWarning => "", } } - -fn unescape_string_error_message(text: &str, mode: Mode) -> &'static str { - let mut error_message = ""; - match mode { - Mode::CStr => { - rustc_lexer::unescape::unescape_mixed(text, mode, &mut |_, res| { - if let Err(e) = res { - 
error_message = error_to_diagnostic_message(e, mode); - } - }); - } - Mode::ByteStr | Mode::Str => { - rustc_lexer::unescape::unescape_unicode(text, mode, &mut |_, res| { - if let Err(e) = res { - error_message = error_to_diagnostic_message(e, mode); - } - }); - } - _ => { - // Other Modes are not supported yet or do not apply - } - } - error_message -} diff --git a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs index 7d5ca2704354d..92680afddf789 100644 --- a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs +++ b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs @@ -3,7 +3,8 @@ use std::{borrow::Cow, num::ParseIntError}; use rustc_lexer::unescape::{ - unescape_byte, unescape_char, unescape_mixed, unescape_unicode, EscapeError, MixedUnit, Mode, + unescape_byte, unescape_byte_str, unescape_char, unescape_cstr, unescape_str, unescape_unicode, + EscapeError, MixedUnit, Mode, }; use stdx::always; @@ -218,7 +219,7 @@ impl ast::String { let mut buf = String::new(); let mut prev_end = 0; let mut has_error = None; - unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match ( + unescape_str(text, &mut |char_range, unescaped_char| match ( unescaped_char, buf.capacity() == 0, ) { @@ -259,18 +260,18 @@ impl ast::ByteString { let mut buf: Vec = Vec::new(); let mut prev_end = 0; let mut has_error = None; - unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match ( - unescaped_char, + unescape_byte_str(text, &mut |char_range, unescaped_byte| match ( + unescaped_byte, buf.capacity() == 0, ) { - (Ok(c), false) => buf.push(c as u8), + (Ok(b), false) => buf.push(b), (Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => { prev_end = char_range.end } - (Ok(c), true) => { + (Ok(b), true) => { buf.reserve_exact(text.len()); buf.extend_from_slice(text[..prev_end].as_bytes()); - buf.push(c as u8); + buf.push(b); } (Err(e), _) => has_error = 
Some(e), }); @@ -297,7 +298,7 @@ impl IsString for ast::CString { let text = &self.text()[text_range_no_quotes - start]; let offset = text_range_no_quotes.start() - start; - unescape_mixed(text, Self::MODE, &mut |range, unescaped_char| { + unescape_cstr(text, &mut |range, unescaped_char| { let text_range = TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap()); // XXX: This method should only be used for highlighting ranges. The unescaped @@ -323,10 +324,7 @@ impl ast::CString { MixedUnit::Char(c) => buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()), MixedUnit::HighByte(b) => buf.push(b), }; - unescape_mixed(text, Self::MODE, &mut |char_range, unescaped| match ( - unescaped, - buf.capacity() == 0, - ) { + unescape_cstr(text, &mut |char_range, unescaped| match (unescaped, buf.capacity() == 0) { (Ok(u), false) => extend_unit(&mut buf, u), (Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => { prev_end = char_range.end diff --git a/src/tools/rust-analyzer/crates/syntax/src/validation.rs b/src/tools/rust-analyzer/crates/syntax/src/validation.rs index 13d352d3c691b..d14a39737995d 100644 --- a/src/tools/rust-analyzer/crates/syntax/src/validation.rs +++ b/src/tools/rust-analyzer/crates/syntax/src/validation.rs @@ -5,7 +5,9 @@ mod block; use rowan::Direction; -use rustc_lexer::unescape::{self, unescape_mixed, unescape_unicode, Mode}; +use rustc_lexer::unescape::{ + self, unescape_byte, unescape_byte_str, unescape_char, unescape_cstr, unescape_str, +}; use crate::{ algo, @@ -139,7 +141,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { ast::LiteralKind::String(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 1, '"') { - unescape_unicode(without_quotes, Mode::Str, &mut |range, char| { + unescape_str(without_quotes, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -150,7 +152,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { 
ast::LiteralKind::ByteString(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 2, '"') { - unescape_unicode(without_quotes, Mode::ByteStr, &mut |range, char| { + unescape_byte_str(without_quotes, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -161,7 +163,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { ast::LiteralKind::CString(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 2, '"') { - unescape_mixed(without_quotes, Mode::CStr, &mut |range, char| { + unescape_cstr(without_quotes, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -171,7 +173,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { } ast::LiteralKind::Char(_) => { if let Some(without_quotes) = unquote(text, 1, '\'') { - unescape_unicode(without_quotes, Mode::Char, &mut |range, char| { + unescape_char(without_quotes, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -180,8 +182,8 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { } ast::LiteralKind::Byte(_) => { if let Some(without_quotes) = unquote(text, 2, '\'') { - unescape_unicode(without_quotes, Mode::Byte, &mut |range, char| { - if let Err(err) = char { + unescape_byte(without_quotes, &mut |range, byte| { + if let Err(err) = byte { push_err(2, range.start, err); } });