From 43f7cb31c48b22c063fdf0c74859ec12e5a1e8ed Mon Sep 17 00:00:00 2001
From: Marijn Schouten <mhkbst@gmail.com>
Date: Sun, 2 Feb 2025 19:19:09 +0100
Subject: [PATCH] Restructure rustc_lexer::unescape

Separate the functions for unescaping each kind of string and unit:
 - this duplicates some code, but also gets rid of code that is only there for genericity
 - each function is now simpler by inlining booleans, which might lead to faster code

Use a Peekable<CharIndices<'_>> instead of going back and forth between a string slice and a chars iterator:
 - this gets rid of most position computations
 - this allows removing the double traversal needed to handle backslash-newline escapes correctly in skip_ascii_whitespace

Improve documentation
---
 compiler/rustc_ast/src/util/literal.rs        |  11 +-
 compiler/rustc_lexer/src/unescape.rs          | 557 ++++++++++--------
 compiler/rustc_lexer/src/unescape/tests.rs    |   6 +-
 compiler/rustc_parse/src/lexer/mod.rs         |   6 +-
 .../crates/parser/src/lexed_str.rs            |  42 +-
 .../crates/syntax/src/ast/token_ext.rs        |  22 +-
 .../crates/syntax/src/validation.rs           |  16 +-
 7 files changed, 354 insertions(+), 306 deletions(-)

diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs
index 4459cb962e8e9..4e1c6ed336d55 100644
--- a/compiler/rustc_ast/src/util/literal.rs
+++ b/compiler/rustc_ast/src/util/literal.rs
@@ -3,7 +3,7 @@
 use std::{ascii, fmt, str};
 
 use rustc_lexer::unescape::{
-    MixedUnit, Mode, byte_from_char, unescape_byte, unescape_char, unescape_mixed, unescape_unicode,
+    MixedUnit, unescape_byte, unescape_byte_str, unescape_char, unescape_cstr, unescape_str,
 };
 use rustc_span::{Span, Symbol, kw, sym};
 use tracing::debug;
@@ -87,9 +87,8 @@ impl LitKind {
                     // Force-inlining here is aggressive but the closure is
                     // called on every char in the string, so it can be hot in
                     // programs with many long strings containing escapes.
-                    unescape_unicode(
+                    unescape_str(
                         s,
-                        Mode::Str,
                         &mut #[inline(always)]
                         |_, c| match c {
                             Ok(c) => buf.push(c),
@@ -111,8 +110,8 @@ impl LitKind {
             token::ByteStr => {
                 let s = symbol.as_str();
                 let mut buf = Vec::with_capacity(s.len());
-                unescape_unicode(s, Mode::ByteStr, &mut |_, c| match c {
-                    Ok(c) => buf.push(byte_from_char(c)),
+                unescape_byte_str(s, &mut |_, c| match c {
+                    Ok(c) => buf.push(c),
                     Err(err) => {
                         assert!(!err.is_fatal(), "failed to unescape string literal")
                     }
@@ -128,7 +127,7 @@ impl LitKind {
             token::CStr => {
                 let s = symbol.as_str();
                 let mut buf = Vec::with_capacity(s.len());
-                unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
+                unescape_cstr(s, &mut |_span, c| match c {
                     Ok(MixedUnit::Char(c)) => {
                         buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
                     }
diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index d6ea4249247f3..ef1c7e39d391b 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -1,8 +1,9 @@
 //! Utilities for validating string and char literals and turning them into
 //! values they represent.
 
+use std::iter::{Peekable, from_fn};
 use std::ops::Range;
-use std::str::Chars;
+use std::str::{CharIndices, Chars};
 
 use Mode::*;
 
@@ -80,33 +81,6 @@ impl EscapeError {
     }
 }
 
-/// Takes the contents of a unicode-only (non-mixed-utf8) literal (without
-/// quotes) and produces a sequence of escaped characters or errors.
-///
-/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
-/// the callback will be called exactly once.
-pub fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F)
-where
-    F: FnMut(Range<usize>, Result<char, EscapeError>),
-{
-    match mode {
-        Char | Byte => {
-            let mut chars = src.chars();
-            let res = unescape_char_or_byte(&mut chars, mode);
-            callback(0..(src.len() - chars.as_str().len()), res);
-        }
-        Str | ByteStr => unescape_non_raw_common(src, mode, callback),
-        RawStr | RawByteStr => check_raw_common(src, mode, callback),
-        RawCStr => check_raw_common(src, mode, &mut |r, mut result| {
-            if let Ok('\0') = result {
-                result = Err(EscapeError::NulInCStr);
-            }
-            callback(r, result)
-        }),
-        CStr => unreachable!(),
-    }
-}
-
 /// Used for mixed utf8 string literals, i.e. those that allow both unicode
 /// chars and high bytes.
 pub enum MixedUnit {
@@ -138,139 +112,285 @@ impl From<u8> for MixedUnit {
     }
 }
 
-/// Takes the contents of a mixed-utf8 literal (without quotes) and produces
-/// a sequence of escaped characters or errors.
-///
-/// Values are returned by invoking `callback`.
-pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F)
+/// Takes the contents of a raw string literal (without quotes) and produces a
+/// sequence of characters or errors, which are returned by invoking `callback`.
+/// NOTE: Raw strings don't do any unescaping, but do produce errors on bare CR.
+fn check_raw_str<F>(src: &str, callback: &mut F)
 where
-    F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
+    F: FnMut(Range<usize>, Result<char, EscapeError>),
 {
-    match mode {
-        CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| {
-            if let Ok(MixedUnit::Char('\0')) = result {
-                result = Err(EscapeError::NulInCStr);
-            }
-            callback(r, result)
-        }),
-        Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(),
-    }
+    src.char_indices().for_each(|(pos, c)| {
+        callback(
+            pos..pos + c.len_utf8(),
+            if c == '\r' { Err(EscapeError::BareCarriageReturnInRawString) } else { Ok(c) },
+        );
+    });
 }
 
-/// Takes a contents of a char literal (without quotes), and returns an
-/// unescaped char or an error.
-pub fn unescape_char(src: &str) -> Result<char, EscapeError> {
-    unescape_char_or_byte(&mut src.chars(), Char)
+/// Takes the contents of a raw byte string literal (without quotes) and produces a
+/// sequence of characters or errors, which are returned by invoking `callback`.
+/// NOTE: Raw strings don't do any unescaping, but do produce errors on bare CR.
+fn check_raw_byte_str<F>(src: &str, callback: &mut F)
+where
+    F: FnMut(Range<usize>, Result<u8, EscapeError>),
+{
+    src.char_indices().for_each(|(pos, c)| {
+        callback(
+            pos..pos + c.len_utf8(),
+            if c == '\r' {
+                Err(EscapeError::BareCarriageReturnInRawString)
+            } else {
+                c.try_into().map_err(|_| EscapeError::NonAsciiCharInByte)
+            },
+        );
+    });
 }
 
-/// Takes a contents of a byte literal (without quotes), and returns an
-/// unescaped byte or an error.
-pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
-    unescape_char_or_byte(&mut src.chars(), Byte).map(byte_from_char)
+/// Takes the contents of a raw C string literal (without quotes) and produces a
+/// sequence of characters or errors, which are returned by invoking `callback`.
+/// NOTE: Raw strings don't do any unescaping, but do produce errors on bare CR.
+fn check_raw_cstr<F>(src: &str, callback: &mut F)
+where
+    F: FnMut(Range<usize>, Result<char, EscapeError>),
+{
+    src.char_indices().for_each(|(pos, c)| {
+        callback(pos..pos + c.len_utf8(), match c {
+            '\r' => Err(EscapeError::BareCarriageReturnInRawString),
+            '\0' => Err(EscapeError::NulInCStr),
+            _ => Ok(c),
+        });
+    });
 }
 
-/// What kind of literal do we parse.
-#[derive(Debug, Clone, Copy, PartialEq)]
-pub enum Mode {
-    Char,
-
-    Byte,
-
-    Str,
-    RawStr,
-
-    ByteStr,
-    RawByteStr,
+/// Take the contents of a string literal (without quotes)
+/// and produce a sequence of unescaped characters or errors,
+/// which are returned by invoking `callback`.
+pub fn unescape_str<F>(src: &str, callback: &mut F)
+where
+    F: FnMut(Range<usize>, Result<char, EscapeError>),
+{
+    let mut chars = src.char_indices().peekable();
+    while let Some((start, c)) = chars.next() {
+        let res = match c {
+            // skip whitespace for backslash newline, see [Rust language reference]
+            // (https://doc.rust-lang.org/reference/tokens.html#string-literals).
+            '\\' if chars.next_if(|&(_, c)| c == '\n').is_some() => {
+                let mut callback_err = |range, err| callback(range, Err(err));
+                skip_ascii_whitespace(&mut chars, start, &mut callback_err);
+                continue;
+            }
+            '\\' => scan_escape_for_char(&mut from_fn(|| chars.next().map(|i| i.1))),
+            '"' => Err(EscapeError::EscapeOnlyChar),
+            '\r' => Err(EscapeError::BareCarriageReturn),
+            c => Ok(c),
+        };
+        let end = chars.peek().map(|&(end, _)| end).unwrap_or(src.len());
+        callback(start..end, res);
+    }
+}
 
-    CStr,
-    RawCStr,
+/// Take the contents of a byte string literal (without quotes)
+/// and produce a sequence of unescaped bytes or errors,
+/// which are returned by invoking `callback`.
+pub fn unescape_byte_str<F>(src: &str, callback: &mut F)
+where
+    F: FnMut(Range<usize>, Result<u8, EscapeError>),
+{
+    let mut chars = src.char_indices().peekable();
+    while let Some((start, c)) = chars.next() {
+        let res = match c {
+            // skip whitespace for backslash newline, see [Rust language reference]
+            // (https://doc.rust-lang.org/reference/tokens.html#string-literals).
+            '\\' if chars.next_if(|&(_, c)| c == '\n').is_some() => {
+                let mut callback_err = |range, err| callback(range, Err(err));
+                skip_ascii_whitespace(&mut chars, start, &mut callback_err);
+                continue;
+            }
+            '\\' => scan_escape_for_byte(&mut from_fn(|| chars.next().map(|i| i.1))),
+            '"' => Err(EscapeError::EscapeOnlyChar),
+            '\r' => Err(EscapeError::BareCarriageReturn),
+            c => c.try_into().map_err(|_| EscapeError::NonAsciiCharInByte),
+        };
+        let end = chars.peek().map(|&(end, _)| end).unwrap_or(src.len());
+        callback(start..end, res);
+    }
 }
 
-impl Mode {
-    pub fn in_double_quotes(self) -> bool {
-        match self {
-            Str | RawStr | ByteStr | RawByteStr | CStr | RawCStr => true,
-            Char | Byte => false,
-        }
+/// Take the contents of a C string literal (without quotes)
+/// and produce a sequence of unescaped characters or errors,
+/// which are returned by invoking `callback`.
+pub fn unescape_cstr<F>(src: &str, callback: &mut F)
+where
+    F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
+{
+    let mut chars = src.char_indices().peekable();
+    while let Some((start, c)) = chars.next() {
+        let res = match c {
+            // skip whitespace for backslash newline, see [Rust language reference]
+            // (https://doc.rust-lang.org/reference/tokens.html#string-literals).
+            '\\' if chars.next_if(|&(_, c)| c == '\n').is_some() => {
+                let mut callback_err = |range, err| callback(range, Err(err));
+                skip_ascii_whitespace(&mut chars, start, &mut callback_err);
+                continue;
+            }
+            '\\' => scan_escape_for_cstr(&mut from_fn(|| chars.next().map(|i| i.1))),
+            '"' => Err(EscapeError::EscapeOnlyChar),
+            '\r' => Err(EscapeError::BareCarriageReturn),
+            '\0' => Err(EscapeError::NulInCStr),
+            c => Ok(c.into()),
+        };
+        let end = chars.peek().map(|&(end, _)| end).unwrap_or(src.len());
+        callback(start..end, res);
     }
+}
 
-    /// Are `\x80`..`\xff` allowed?
-    fn allow_high_bytes(self) -> bool {
-        match self {
-            Char | Str => false,
-            Byte | ByteStr | CStr => true,
-            RawStr | RawByteStr | RawCStr => unreachable!(),
-        }
+/// Skip ASCII whitespace.
+/// Warns on unescaped newline and following non-ASCII whitespace.
+fn skip_ascii_whitespace<F>(chars: &mut Peekable<CharIndices<'_>>, start: usize, callback: &mut F)
+where
+    F: FnMut(Range<usize>, EscapeError),
+{
+    // the escaping slash and newline characters add 2 bytes
+    let mut end = start + 2;
+    let mut contains_nl = false;
+    while let Some((_, c)) = chars.next_if(|&(_, c)| matches!(c, ' ' | '\t' | '\n' | '\r')) {
+        end += 1;
+        contains_nl = contains_nl || c == '\n';
     }
 
-    /// Are unicode (non-ASCII) chars allowed?
-    #[inline]
-    fn allow_unicode_chars(self) -> bool {
-        match self {
-            Byte | ByteStr | RawByteStr => false,
-            Char | Str | RawStr | CStr | RawCStr => true,
+    if contains_nl {
+        callback(start..end, EscapeError::MultipleSkippedLinesWarning);
+    }
+    if let Some((_, c)) = chars.peek() {
+        if c.is_whitespace() {
+            // for error reporting, include the character that was not skipped in the span
+            callback(start..end + c.len_utf8(), EscapeError::UnskippedWhitespaceWarning);
         }
     }
+}
 
-    /// Are unicode escapes (`\u`) allowed?
-    fn allow_unicode_escapes(self) -> bool {
-        match self {
-            Byte | ByteStr => false,
-            Char | Str | CStr => true,
-            RawByteStr | RawStr | RawCStr => unreachable!(),
-        }
+/// Takes the contents of a char literal (without quotes),
+/// and returns an unescaped char or an error.
+pub fn unescape_char(src: &str) -> Result<char, EscapeError> {
+    unescape_char_iter(&mut src.chars())
+}
+
+fn unescape_char_iter(chars: &mut Chars<'_>) -> Result<char, EscapeError> {
+    let res = match chars.next().ok_or(EscapeError::ZeroChars)? {
+        '\\' => scan_escape_for_char(chars),
+        '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
+        '\r' => Err(EscapeError::BareCarriageReturn),
+        c => Ok(c),
+    }?;
+    if chars.next().is_some() {
+        return Err(EscapeError::MoreThanOneChar);
     }
+    Ok(res)
+}
 
-    pub fn prefix_noraw(self) -> &'static str {
-        match self {
-            Char | Str | RawStr => "",
-            Byte | ByteStr | RawByteStr => "b",
-            CStr | RawCStr => "c",
-        }
+/// Takes the contents of a byte literal (without quotes),
+/// and returns an unescaped byte or an error.
+pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
+    unescape_byte_iter(&mut src.chars())
+}
+
+fn unescape_byte_iter(chars: &mut Chars<'_>) -> Result<u8, EscapeError> {
+    let res = match chars.next().ok_or(EscapeError::ZeroChars)? {
+        '\\' => scan_escape_for_byte(chars),
+        '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
+        '\r' => Err(EscapeError::BareCarriageReturn),
+        c => c.try_into().map_err(|_| EscapeError::NonAsciiCharInByte),
+    }?;
+    if chars.next().is_some() {
+        return Err(EscapeError::MoreThanOneChar);
     }
+    Ok(res)
 }
 
-fn scan_escape<T: From<char> + From<u8>>(
-    chars: &mut Chars<'_>,
-    mode: Mode,
-) -> Result<T, EscapeError> {
+/// Scan an escape sequence for a char
+fn scan_escape_for_char(chars: &mut impl Iterator<Item = char>) -> Result<char, EscapeError> {
     // Previous character was '\\', unescape what follows.
-    let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? {
-        '"' => '"',
-        'n' => '\n',
-        'r' => '\r',
-        't' => '\t',
-        '\\' => '\\',
-        '\'' => '\'',
-        '0' => '\0',
+    let next = chars.next().ok_or(EscapeError::LoneSlash)?;
+    simple_escape(next).map(|b| b as char).or_else(|c| match c {
         'x' => {
-            // Parse hexadecimal character code.
-
-            let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
-            let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
-
-            let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
-            let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
+            let byte = hex_escape(chars)?;
+            if byte.is_ascii() { Ok(byte as char) } else { Err(EscapeError::OutOfRangeHexEscape) }
+        }
+        'u' => {
+            let value = unicode_escape(chars)?;
+            if value > char::MAX as u32 {
+                Err(EscapeError::OutOfRangeUnicodeEscape)
+            } else {
+                char::from_u32(value).ok_or(EscapeError::LoneSurrogateUnicodeEscape)
+            }
+        }
+        _ => Err(EscapeError::InvalidEscape),
+    })
+}
 
-            let value = (hi * 16 + lo) as u8;
+/// Scan an escape sequence for a byte
+fn scan_escape_for_byte(chars: &mut impl Iterator<Item = char>) -> Result<u8, EscapeError> {
+    // Previous character was '\\', unescape what follows.
+    let next = chars.next().ok_or(EscapeError::LoneSlash)?;
+    simple_escape(next).or_else(|c| match c {
+        'x' => hex_escape(chars),
+        'u' => {
+            let _ = unicode_escape(chars)?;
+            Err(EscapeError::UnicodeEscapeInByte)
+        }
+        _ => Err(EscapeError::InvalidEscape),
+    })
+}
 
-            return if !mode.allow_high_bytes() && !value.is_ascii() {
-                Err(EscapeError::OutOfRangeHexEscape)
+fn scan_escape_for_cstr(chars: &mut impl Iterator<Item = char>) -> Result<MixedUnit, EscapeError> {
+    // Previous character was '\\', unescape what follows.
+    let next = chars.next().ok_or(EscapeError::LoneSlash)?;
+    simple_escape(next).map(MixedUnit::from).or_else(|c| match c {
+        'x' => Ok(MixedUnit::from(hex_escape(chars)?)),
+        'u' => {
+            let value = unicode_escape(chars)?;
+            if value > char::MAX as u32 {
+                Err(EscapeError::OutOfRangeUnicodeEscape)
             } else {
-                // This may be a high byte, but that will only happen if `T` is
-                // `MixedUnit`, because of the `allow_high_bytes` check above.
-                Ok(T::from(value))
-            };
+                char::from_u32(value)
+                    .map(MixedUnit::Char)
+                    .ok_or(EscapeError::LoneSurrogateUnicodeEscape)
+            }
         }
-        'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from),
-        _ => return Err(EscapeError::InvalidEscape),
-    };
-    Ok(T::from(res))
+
+        _ => Err(EscapeError::InvalidEscape),
+    })
+}
+
+/// Parse the character of an ASCII escape without the leading backslash.
+fn simple_escape(c: char) -> Result<u8, char> {
+    // Previous character was '\\', unescape what follows.
+    Ok(match c {
+        '"' => b'"',
+        'n' => b'\n',
+        'r' => b'\r',
+        't' => b'\t',
+        '\\' => b'\\',
+        '\'' => b'\'',
+        '0' => b'\0',
+        _ => Err(c)?,
+    })
 }
 
-fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<char, EscapeError> {
-    // We've parsed '\u', now we have to parse '{..}'.
+/// Parse the two hexadecimal characters of a hexadecimal escape without the leading r"\x".
+fn hex_escape(chars: &mut impl Iterator<Item = char>) -> Result<u8, EscapeError> {
+    let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
+    let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
+
+    let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
+    let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
 
+    Ok((hi * 16 + lo) as u8)
+}
+
+/// Parse the braces with hexadecimal characters (and underscores) part of a unicode escape.
+/// This r"{...}" normally comes after r"\u" and cannot start with an underscore.
+fn unicode_escape(chars: &mut impl Iterator<Item = char>) -> Result<u32, EscapeError> {
     if chars.next() != Some('{') {
         return Err(EscapeError::NoBraceInUnicodeEscape);
     }
@@ -290,23 +410,13 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
             None => return Err(EscapeError::UnclosedUnicodeEscape),
             Some('_') => continue,
             Some('}') => {
-                if n_digits > 6 {
-                    return Err(EscapeError::OverlongUnicodeEscape);
-                }
-
                 // Incorrect syntax has higher priority for error reporting
                 // than unallowed value for a literal.
-                if !allow_unicode_escapes {
-                    return Err(EscapeError::UnicodeEscapeInByte);
-                }
-
-                break std::char::from_u32(value).ok_or({
-                    if value > 0x10FFFF {
-                        EscapeError::OutOfRangeUnicodeEscape
-                    } else {
-                        EscapeError::LoneSurrogateUnicodeEscape
-                    }
-                });
+                return if n_digits > 6 {
+                    Err(EscapeError::OverlongUnicodeEscape)
+                } else {
+                    Ok(value)
+                };
             }
             Some(c) => {
                 let digit: u32 = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
@@ -321,118 +431,67 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
     }
 }
 
-#[inline]
-fn ascii_check(c: char, allow_unicode_chars: bool) -> Result<char, EscapeError> {
-    if allow_unicode_chars || c.is_ascii() { Ok(c) } else { Err(EscapeError::NonAsciiCharInByte) }
-}
-
-fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
-    let c = chars.next().ok_or(EscapeError::ZeroChars)?;
-    let res = match c {
-        '\\' => scan_escape(chars, mode),
-        '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
-        '\r' => Err(EscapeError::BareCarriageReturn),
-        _ => ascii_check(c, mode.allow_unicode_chars()),
-    }?;
-    if chars.next().is_some() {
-        return Err(EscapeError::MoreThanOneChar);
-    }
-    Ok(res)
-}
-
-/// Takes a contents of a string literal (without quotes) and produces a
-/// sequence of escaped characters or errors.
-fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F)
+/// Takes the contents of a unicode-only (non-mixed-utf8) literal (without quotes)
+/// and produces a sequence of unescaped characters or errors,
+/// which are returned by invoking `callback`.
+///
+/// For `Char` and `Byte` modes, the callback will be called exactly once.
+pub fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F)
 where
-    F: FnMut(Range<usize>, Result<T, EscapeError>),
+    F: FnMut(Range<usize>, Result<char, EscapeError>),
 {
-    let mut chars = src.chars();
-    let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
-
-    // The `start` and `end` computation here is complicated because
-    // `skip_ascii_whitespace` makes us to skip over chars without counting
-    // them in the range computation.
-    while let Some(c) = chars.next() {
-        let start = src.len() - chars.as_str().len() - c.len_utf8();
-        let res = match c {
-            '\\' => {
-                match chars.clone().next() {
-                    Some('\n') => {
-                        // Rust language specification requires us to skip whitespaces
-                        // if unescaped '\' character is followed by '\n'.
-                        // For details see [Rust language reference]
-                        // (https://doc.rust-lang.org/reference/tokens.html#string-literals).
-                        skip_ascii_whitespace(&mut chars, start, &mut |range, err| {
-                            callback(range, Err(err))
-                        });
-                        continue;
-                    }
-                    _ => scan_escape::<T>(&mut chars, mode),
-                }
-            }
-            '"' => Err(EscapeError::EscapeOnlyChar),
-            '\r' => Err(EscapeError::BareCarriageReturn),
-            _ => ascii_check(c, allow_unicode_chars).map(T::from),
-        };
-        let end = src.len() - chars.as_str().len();
-        callback(start..end, res);
+    let mut byte_callback =
+        |range, res: Result<u8, EscapeError>| callback(range, res.map(char::from));
+    match mode {
+        Char => {
+            let mut chars = src.chars();
+            let res = unescape_char_iter(&mut chars);
+            callback(0..(src.len() - chars.as_str().len()), res);
+        }
+        Byte => {
+            let mut chars = src.chars();
+            let res = unescape_byte_iter(&mut chars).map(char::from);
+            callback(0..(src.len() - chars.as_str().len()), res);
+        }
+        Str => unescape_str(src, callback),
+        ByteStr => unescape_byte_str(src, &mut byte_callback),
+        RawStr => check_raw_str(src, callback),
+        RawByteStr => check_raw_byte_str(src, &mut byte_callback),
+        RawCStr => check_raw_cstr(src, callback),
+        CStr => unreachable!(),
     }
 }
 
-fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
-where
-    F: FnMut(Range<usize>, EscapeError),
-{
-    let tail = chars.as_str();
-    let first_non_space = tail
-        .bytes()
-        .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
-        .unwrap_or(tail.len());
-    if tail[1..first_non_space].contains('\n') {
-        // The +1 accounts for the escaping slash.
-        let end = start + first_non_space + 1;
-        callback(start..end, EscapeError::MultipleSkippedLinesWarning);
-    }
-    let tail = &tail[first_non_space..];
-    if let Some(c) = tail.chars().next() {
-        if c.is_whitespace() {
-            // For error reporting, we would like the span to contain the character that was not
-            // skipped. The +1 is necessary to account for the leading \ that started the escape.
-            let end = start + first_non_space + c.len_utf8() + 1;
-            callback(start..end, EscapeError::UnskippedWhitespaceWarning);
-        }
-    }
-    *chars = tail.chars();
+/// What kind of literal do we parse.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub enum Mode {
+    Char,
+
+    Byte,
+
+    Str,
+    RawStr,
+
+    ByteStr,
+    RawByteStr,
+
+    CStr,
+    RawCStr,
 }
 
-/// Takes a contents of a string literal (without quotes) and produces a
-/// sequence of characters or errors.
-/// NOTE: Raw strings do not perform any explicit character escaping, here we
-/// only produce errors on bare CR.
-fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F)
-where
-    F: FnMut(Range<usize>, Result<char, EscapeError>),
-{
-    let mut chars = src.chars();
-    let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
-
-    // The `start` and `end` computation here matches the one in
-    // `unescape_non_raw_common` for consistency, even though this function
-    // doesn't have to worry about skipping any chars.
-    while let Some(c) = chars.next() {
-        let start = src.len() - chars.as_str().len() - c.len_utf8();
-        let res = match c {
-            '\r' => Err(EscapeError::BareCarriageReturnInRawString),
-            _ => ascii_check(c, allow_unicode_chars),
-        };
-        let end = src.len() - chars.as_str().len();
-        callback(start..end, res);
+impl Mode {
+    pub fn in_double_quotes(self) -> bool {
+        match self {
+            Str | RawStr | ByteStr | RawByteStr | CStr | RawCStr => true,
+            Char | Byte => false,
+        }
     }
-}
 
-#[inline]
-pub fn byte_from_char(c: char) -> u8 {
-    let res = c as u32;
-    debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr");
-    res as u8
+    pub fn prefix_noraw(self) -> &'static str {
+        match self {
+            Char | Str | RawStr => "",
+            Byte | ByteStr | RawByteStr => "b",
+            CStr | RawCStr => "c",
+        }
+    }
 }
diff --git a/compiler/rustc_lexer/src/unescape/tests.rs b/compiler/rustc_lexer/src/unescape/tests.rs
index 6fa7a150516b8..715037e606713 100644
--- a/compiler/rustc_lexer/src/unescape/tests.rs
+++ b/compiler/rustc_lexer/src/unescape/tests.rs
@@ -241,7 +241,11 @@ fn test_unescape_byte_str_good() {
         unescape_unicode(literal_text, Mode::ByteStr, &mut |range, c| {
             if let Ok(b) = &mut buf {
                 match c {
-                    Ok(c) => b.push(byte_from_char(c)),
+                    Ok(c) => {
+                        let c = c as u32;
+                        debug_assert!(c <= u8::MAX as u32, "guaranteed because of ByteStr");
+                        b.push(c as u8)
+                    }
                     Err(e) => buf = Err((range, e)),
                 }
             }
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index 792e2cc26ef1c..9aadcf12c57c4 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -985,10 +985,8 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
         prefix_len: u32,
         postfix_len: u32,
     ) -> (token::LitKind, Symbol) {
-        self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
-            unescape::unescape_mixed(src, mode, &mut |span, result| {
-                callback(span, result.map(drop))
-            })
+        self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, _mode, callback| {
+            unescape::unescape_cstr(src, &mut |span, result| callback(span, result.map(drop)))
         })
     }
 }
diff --git a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
index c97596d5097ec..24a64ca31dac4 100644
--- a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
+++ b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
@@ -309,7 +309,11 @@ impl<'a> Converter<'a> {
                     let text = &self.res.text[self.offset + 1..][..len - 1];
                     let i = text.rfind('"').unwrap();
                     let text = &text[..i];
-                    err = unescape_string_error_message(text, Mode::Str);
+                    rustc_lexer::unescape::unescape_str(text, &mut |_, res| {
+                        if let Err(e) = res {
+                            err = error_to_diagnostic_message(e, Mode::Str);
+                        }
+                    });
                 }
                 STRING
             }
@@ -320,7 +324,11 @@ impl<'a> Converter<'a> {
                     let text = &self.res.text[self.offset + 2..][..len - 2];
                     let i = text.rfind('"').unwrap();
                     let text = &text[..i];
-                    err = unescape_string_error_message(text, Mode::ByteStr);
+                    rustc_lexer::unescape::unescape_byte_str(text, &mut |_, res| {
+                        if let Err(e) = res {
+                            err = error_to_diagnostic_message(e, Mode::ByteStr);
+                        }
+                    });
                 }
                 BYTE_STRING
             }
@@ -331,7 +339,11 @@ impl<'a> Converter<'a> {
                     let text = &self.res.text[self.offset + 2..][..len - 2];
                     let i = text.rfind('"').unwrap();
                     let text = &text[..i];
-                    err = unescape_string_error_message(text, Mode::CStr);
+                    rustc_lexer::unescape::unescape_cstr(text, &mut |_, res| {
+                        if let Err(e) = res {
+                            err = error_to_diagnostic_message(e, Mode::CStr);
+                        }
+                    });
                 }
                 C_STRING
             }
@@ -397,27 +409,3 @@ fn error_to_diagnostic_message(error: EscapeError, mode: Mode) -> &'static str {
         EscapeError::MultipleSkippedLinesWarning => "",
     }
 }
-
-fn unescape_string_error_message(text: &str, mode: Mode) -> &'static str {
-    let mut error_message = "";
-    match mode {
-        Mode::CStr => {
-            rustc_lexer::unescape::unescape_mixed(text, mode, &mut |_, res| {
-                if let Err(e) = res {
-                    error_message = error_to_diagnostic_message(e, mode);
-                }
-            });
-        }
-        Mode::ByteStr | Mode::Str => {
-            rustc_lexer::unescape::unescape_unicode(text, mode, &mut |_, res| {
-                if let Err(e) = res {
-                    error_message = error_to_diagnostic_message(e, mode);
-                }
-            });
-        }
-        _ => {
-            // Other Modes are not supported yet or do not apply
-        }
-    }
-    error_message
-}
diff --git a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs
index 7d5ca2704354d..92680afddf789 100644
--- a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs
+++ b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs
@@ -3,7 +3,8 @@
 use std::{borrow::Cow, num::ParseIntError};
 
 use rustc_lexer::unescape::{
-    unescape_byte, unescape_char, unescape_mixed, unescape_unicode, EscapeError, MixedUnit, Mode,
+    unescape_byte, unescape_byte_str, unescape_char, unescape_cstr, unescape_str, unescape_unicode,
+    EscapeError, MixedUnit, Mode,
 };
 use stdx::always;
 
@@ -218,7 +219,7 @@ impl ast::String {
         let mut buf = String::new();
         let mut prev_end = 0;
         let mut has_error = None;
-        unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match (
+        unescape_str(text, &mut |char_range, unescaped_char| match (
             unescaped_char,
             buf.capacity() == 0,
         ) {
@@ -259,18 +260,18 @@ impl ast::ByteString {
         let mut buf: Vec<u8> = Vec::new();
         let mut prev_end = 0;
         let mut has_error = None;
-        unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match (
-            unescaped_char,
+        unescape_byte_str(text, &mut |char_range, unescaped_byte| match (
+            unescaped_byte,
             buf.capacity() == 0,
         ) {
-            (Ok(c), false) => buf.push(c as u8),
+            (Ok(b), false) => buf.push(b),
             (Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => {
                 prev_end = char_range.end
             }
-            (Ok(c), true) => {
+            (Ok(b), true) => {
                 buf.reserve_exact(text.len());
                 buf.extend_from_slice(text[..prev_end].as_bytes());
-                buf.push(c as u8);
+                buf.push(b);
             }
             (Err(e), _) => has_error = Some(e),
         });
@@ -297,7 +298,7 @@ impl IsString for ast::CString {
         let text = &self.text()[text_range_no_quotes - start];
         let offset = text_range_no_quotes.start() - start;
 
-        unescape_mixed(text, Self::MODE, &mut |range, unescaped_char| {
+        unescape_cstr(text, &mut |range, unescaped_char| {
             let text_range =
                 TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap());
             // XXX: This method should only be used for highlighting ranges. The unescaped
@@ -323,10 +324,7 @@ impl ast::CString {
             MixedUnit::Char(c) => buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()),
             MixedUnit::HighByte(b) => buf.push(b),
         };
-        unescape_mixed(text, Self::MODE, &mut |char_range, unescaped| match (
-            unescaped,
-            buf.capacity() == 0,
-        ) {
+        unescape_cstr(text, &mut |char_range, unescaped| match (unescaped, buf.capacity() == 0) {
             (Ok(u), false) => extend_unit(&mut buf, u),
             (Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => {
                 prev_end = char_range.end
diff --git a/src/tools/rust-analyzer/crates/syntax/src/validation.rs b/src/tools/rust-analyzer/crates/syntax/src/validation.rs
index 13d352d3c691b..d14a39737995d 100644
--- a/src/tools/rust-analyzer/crates/syntax/src/validation.rs
+++ b/src/tools/rust-analyzer/crates/syntax/src/validation.rs
@@ -5,7 +5,9 @@
 mod block;
 
 use rowan::Direction;
-use rustc_lexer::unescape::{self, unescape_mixed, unescape_unicode, Mode};
+use rustc_lexer::unescape::{
+    self, unescape_byte, unescape_byte_str, unescape_char, unescape_cstr, unescape_str,
+};
 
 use crate::{
     algo,
@@ -139,7 +141,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
         ast::LiteralKind::String(s) => {
             if !s.is_raw() {
                 if let Some(without_quotes) = unquote(text, 1, '"') {
-                    unescape_unicode(without_quotes, Mode::Str, &mut |range, char| {
+                    unescape_str(without_quotes, &mut |range, char| {
                         if let Err(err) = char {
                             push_err(1, range.start, err);
                         }
@@ -150,7 +152,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
         ast::LiteralKind::ByteString(s) => {
             if !s.is_raw() {
                 if let Some(without_quotes) = unquote(text, 2, '"') {
-                    unescape_unicode(without_quotes, Mode::ByteStr, &mut |range, char| {
+                    unescape_byte_str(without_quotes, &mut |range, char| {
                         if let Err(err) = char {
                             push_err(1, range.start, err);
                         }
@@ -161,7 +163,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
         ast::LiteralKind::CString(s) => {
             if !s.is_raw() {
                 if let Some(without_quotes) = unquote(text, 2, '"') {
-                    unescape_mixed(without_quotes, Mode::CStr, &mut |range, char| {
+                    unescape_cstr(without_quotes, &mut |range, char| {
                         if let Err(err) = char {
                             push_err(1, range.start, err);
                         }
@@ -171,7 +173,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
         }
         ast::LiteralKind::Char(_) => {
             if let Some(without_quotes) = unquote(text, 1, '\'') {
-                unescape_unicode(without_quotes, Mode::Char, &mut |range, char| {
+                unescape_char(without_quotes, &mut |range, char| {
                     if let Err(err) = char {
                         push_err(1, range.start, err);
                     }
@@ -180,8 +182,8 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
         }
         ast::LiteralKind::Byte(_) => {
             if let Some(without_quotes) = unquote(text, 2, '\'') {
-                unescape_unicode(without_quotes, Mode::Byte, &mut |range, char| {
-                    if let Err(err) = char {
+                unescape_byte(without_quotes, &mut |range, byte| {
+                    if let Err(err) = byte {
                         push_err(2, range.start, err);
                     }
                 });