Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2151,6 +2151,13 @@ version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104"

[[package]]
name = "literal-escaper"
version = "0.0.0"
dependencies = [
"rustc-std-workspace-std 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "lld-wrapper"
version = "0.1.0"
Expand Down Expand Up @@ -3328,6 +3335,12 @@ version = "1.0.1"
name = "rustc-std-workspace-std"
version = "1.0.1"

[[package]]
name = "rustc-std-workspace-std"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aba676a20abe46e5b0f1b0deae474aaaf31407e6c71147159890574599da04ef"

[[package]]
name = "rustc_abi"
version = "0.0.0"
Expand Down Expand Up @@ -3366,6 +3379,7 @@ name = "rustc_ast"
version = "0.0.0"
dependencies = [
"bitflags",
"literal-escaper",
"memchr",
"rustc_ast_ir",
"rustc_data_structures",
Expand Down Expand Up @@ -4060,6 +4074,7 @@ name = "rustc_lexer"
version = "0.0.0"
dependencies = [
"expect-test",
"literal-escaper",
"memchr",
"unicode-properties",
"unicode-xid",
Expand Down Expand Up @@ -4325,6 +4340,7 @@ name = "rustc_parse"
version = "0.0.0"
dependencies = [
"bitflags",
"literal-escaper",
"rustc_ast",
"rustc_ast_pretty",
"rustc_data_structures",
Expand All @@ -4347,6 +4363,7 @@ dependencies = [
name = "rustc_parse_format"
version = "0.0.0"
dependencies = [
"literal-escaper",
"rustc_index",
"rustc_lexer",
]
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_ast/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ edition = "2021"
[dependencies]
# tidy-alphabetical-start
bitflags = "2.4.1"
literal-escaper = { path = "../../library/literal-escaper" }
memchr = "2.7.4"
rustc_ast_ir = { path = "../rustc_ast_ir" }
rustc_data_structures = { path = "../rustc_data_structures" }
Expand Down
2 changes: 1 addition & 1 deletion compiler/rustc_ast/src/util/literal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

use std::{ascii, fmt, str};

use rustc_lexer::unescape::{
use literal_escaper::{
MixedUnit, Mode, byte_from_char, unescape_byte, unescape_char, unescape_mixed, unescape_unicode,
};
use rustc_span::{Span, Symbol, kw, sym};
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_lexer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Rust lexer used by rustc. No stability guarantees are provided.
[dependencies]
memchr = "2.7.4"
unicode-xid = "0.2.0"
literal-escaper = { path = "../../library/literal-escaper" }

[dependencies.unicode-properties]
version = "0.1.0"
Expand Down
4 changes: 3 additions & 1 deletion compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,13 @@
// tidy-alphabetical-end

mod cursor;
pub mod unescape;

#[cfg(test)]
mod tests;

// FIXME: This re-export is kept only for rust-analyzer. Remove it once rust-analyzer depends on
// `literal-escaper` directly.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a sub-tree so you can just make the changes directly.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For now it's tricky because it requires making literal_escaper a dependency of r-a. I'll check with the r-a people directly after this is merged.

Copy link
Member

@Veykril Veykril Mar 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will break our auto publishing of rustc_lexer and I don't immediately see how to prevent that :/ That means we won't be able to sync our subtree after this change. Or well, I guess we just can't bump this crate until literal_escaper ships on stable. So maybe that's fine as long as no relevant changes happen to rustc_lexer.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No wait, this will break rustc_lexer and rustc_parse_format_args in general if I see this right? Those crates need to compile on stable which they no longer will by depending on this new unstable standard library crate.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, so for the time being, I leave it as is. Let's come up with a solution later on. :)

pub use literal_escaper as unescape;
use unicode_properties::UnicodeEmoji;
pub use unicode_xid::UNICODE_VERSION as UNICODE_XID_VERSION;

Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_parse/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ edition = "2021"
[dependencies]
# tidy-alphabetical-start
bitflags = "2.4.1"
literal-escaper = { path = "../../library/literal-escaper" }
rustc_ast = { path = "../rustc_ast" }
rustc_ast_pretty = { path = "../rustc_ast_pretty" }
rustc_data_structures = { path = "../rustc_data_structures" }
Expand Down
6 changes: 3 additions & 3 deletions compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
use std::ops::Range;

use literal_escaper::{self, EscapeError, Mode};
use rustc_ast::ast::{self, AttrStyle};
use rustc_ast::token::{self, CommentKind, Delimiter, IdentIsRaw, Token, TokenKind};
use rustc_ast::tokenstream::TokenStream;
use rustc_ast::util::unicode::contains_text_flow_control_chars;
use rustc_errors::codes::*;
use rustc_errors::{Applicability, Diag, DiagCtxtHandle, StashKey};
use rustc_lexer::unescape::{self, EscapeError, Mode};
use rustc_lexer::{Base, Cursor, DocStyle, LiteralKind, RawStrError};
use rustc_session::lint::BuiltinLintDiag;
use rustc_session::lint::builtin::{
Expand Down Expand Up @@ -970,7 +970,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
postfix_len: u32,
) -> (token::LitKind, Symbol) {
self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
unescape::unescape_unicode(src, mode, &mut |span, result| {
literal_escaper::unescape_unicode(src, mode, &mut |span, result| {
callback(span, result.map(drop))
})
})
Expand All @@ -986,7 +986,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
postfix_len: u32,
) -> (token::LitKind, Symbol) {
self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
unescape::unescape_mixed(src, mode, &mut |span, result| {
literal_escaper::unescape_mixed(src, mode, &mut |span, result| {
callback(span, result.map(drop))
})
})
Expand Down
2 changes: 1 addition & 1 deletion compiler/rustc_parse/src/lexer/unescape_error_reporting.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
use std::iter::once;
use std::ops::Range;

use literal_escaper::{EscapeError, Mode};
use rustc_errors::{Applicability, DiagCtxtHandle, ErrorGuaranteed};
use rustc_lexer::unescape::{EscapeError, Mode};
use rustc_span::{BytePos, Span};
use tracing::debug;

Expand Down
2 changes: 1 addition & 1 deletion compiler/rustc_parse/src/parser/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use core::ops::{Bound, ControlFlow};
use ast::mut_visit::{self, MutVisitor};
use ast::token::IdentIsRaw;
use ast::{CoroutineKind, ForLoopKind, GenBlockKind, MatchKind, Pat, Path, PathSegment, Recovered};
use literal_escaper::unescape_char;
use rustc_ast::ptr::P;
use rustc_ast::token::{self, Delimiter, Token, TokenKind};
use rustc_ast::tokenstream::TokenTree;
Expand All @@ -21,7 +22,6 @@ use rustc_ast::{
use rustc_ast_pretty::pprust;
use rustc_data_structures::stack::ensure_sufficient_stack;
use rustc_errors::{Applicability, Diag, PResult, StashKey, Subdiagnostic};
use rustc_lexer::unescape::unescape_char;
use rustc_macros::Subdiagnostic;
use rustc_session::errors::{ExprParenthesesNeeded, report_lit_error};
use rustc_session::lint::BuiltinLintDiag;
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_parse_format/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ edition = "2021"

[dependencies]
# tidy-alphabetical-start
literal-escaper = { path = "../../library/literal-escaper" }
rustc_index = { path = "../rustc_index", default-features = false }
rustc_lexer = { path = "../rustc_lexer" }
# tidy-alphabetical-end
11 changes: 6 additions & 5 deletions compiler/rustc_parse_format/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
pub use Alignment::*;
pub use Count::*;
pub use Position::*;
use rustc_lexer::unescape;

// Note: copied from rustc_span
/// Range inside of a `Span` used for diagnostics when we only have access to relative positions.
Expand Down Expand Up @@ -1095,12 +1094,14 @@ fn find_width_map_from_snippet(
fn unescape_string(string: &str) -> Option<String> {
let mut buf = String::new();
let mut ok = true;
unescape::unescape_unicode(string, unescape::Mode::Str, &mut |_, unescaped_char| {
match unescaped_char {
literal_escaper::unescape_unicode(
string,
literal_escaper::Mode::Str,
&mut |_, unescaped_char| match unescaped_char {
Ok(c) => buf.push(c),
Err(_) => ok = false,
}
});
},
);

ok.then_some(buf)
}
Expand Down
8 changes: 8 additions & 0 deletions library/Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,13 @@ dependencies = [
"rustc-std-workspace-core",
]

[[package]]
name = "literal-escaper"
version = "0.0.0"
dependencies = [
"rustc-std-workspace-std",
]

[[package]]
name = "memchr"
version = "2.7.4"
Expand Down Expand Up @@ -220,6 +227,7 @@ name = "proc_macro"
version = "0.0.0"
dependencies = [
"core",
"literal-escaper",
"std",
]

Expand Down
1 change: 1 addition & 0 deletions library/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ members = [
]

exclude = [
"literal-escaper",
# stdarch has its own Cargo workspace
"stdarch",
"windows_targets"
Expand Down
10 changes: 10 additions & 0 deletions library/literal-escaper/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[package]
name = "literal-escaper"
version = "0.0.0"
edition = "2021"

[dependencies]
std = { version = '1.0.0', optional = true, package = 'rustc-std-workspace-std' }

[features]
rustc-dep-of-std = ["dep:std"]
4 changes: 4 additions & 0 deletions library/literal-escaper/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# literal-escaper

This crate provides code to unescape string literals. It is used by `rustc_lexer`
and `proc_macro`.
1 change: 1 addition & 0 deletions library/proc_macro/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ version = "0.0.0"
edition = "2021"

[dependencies]
literal-escaper = { path = "../literal-escaper", features = ["rustc-dep-of-std"] }
std = { path = "../std" }
# Workaround: when documenting this crate rustdoc will try to load crate named
# `core` when resolving doc links. Without this line a different `core` will be
Expand Down
115 changes: 115 additions & 0 deletions library/proc_macro/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#![feature(restricted_std)]
#![feature(rustc_attrs)]
#![feature(extend_one)]
#![feature(stmt_expr_attributes)]
#![recursion_limit = "256"]
#![allow(internal_features)]
#![deny(ffi_unwind_calls)]
Expand All @@ -50,11 +51,24 @@ use std::{error, fmt};

#[unstable(feature = "proc_macro_diagnostic", issue = "54140")]
pub use diagnostic::{Diagnostic, Level, MultiSpan};
#[unstable(feature = "proc_macro_value", issue = "136652")]
pub use literal_escaper::EscapeError;
use literal_escaper::{MixedUnit, Mode, byte_from_char, unescape_mixed, unescape_unicode};
#[unstable(feature = "proc_macro_totokens", issue = "130977")]
pub use to_tokens::ToTokens;

use crate::escape::{EscapeOptions, escape_bytes};

/// Errors returned when trying to retrieve the unescaped value of a literal.
#[unstable(feature = "proc_macro_value", issue = "136652")]
#[derive(Debug, PartialEq, Eq)]
pub enum ConversionErrorKind {
/// The literal could not be unescaped; see [`EscapeError`] for more information.
FailedToUnescape(EscapeError),
/// The literal is not of the kind that the conversion method expects.
InvalidLiteralKind,
}

/// Determines whether proc_macro has been made accessible to the currently
/// running program.
///
Expand Down Expand Up @@ -1450,6 +1464,107 @@ impl Literal {
}
})
}

/// Returns the unescaped value of this literal if it is a string literal or a raw string
/// literal.
///
/// Raw string literals and string literals without any backslash are returned unchanged.
///
/// # Errors
///
/// Returns [`ConversionErrorKind::InvalidLiteralKind`] for any other kind of literal, and
/// [`ConversionErrorKind::FailedToUnescape`] when unescaping reports an error for which
/// `is_fatal()` is true.
#[unstable(feature = "proc_macro_value", issue = "136652")]
pub fn str_value(&self) -> Result<String, ConversionErrorKind> {
    self.0.symbol.with(|symbol| match self.0.kind {
        // Nothing to unescape: the symbol text already is the value.
        bridge::LitKind::StrRaw(_) => Ok(symbol.to_string()),
        bridge::LitKind::Str if !symbol.contains('\\') => Ok(symbol.to_string()),
        bridge::LitKind::Str => {
            let mut unescaped = String::with_capacity(symbol.len());
            let mut failure = None;
            // The callback is invoked for every char of the literal, which can make it
            // hot for long strings with escapes; hence the (aggressive) forced inlining.
            unescape_unicode(
                symbol,
                Mode::Str,
                &mut #[inline(always)]
                |_, unit| match unit {
                    Ok(ch) => unescaped.push(ch),
                    // If several fatal errors are reported, the most recent one is kept.
                    Err(err) if err.is_fatal() => {
                        failure = Some(ConversionErrorKind::FailedToUnescape(err));
                    }
                    // Non-fatal errors are deliberately ignored.
                    Err(_) => {}
                },
            );
            match failure {
                Some(err) => Err(err),
                None => Ok(unescaped),
            }
        }
        _ => Err(ConversionErrorKind::InvalidLiteralKind),
    })
}

/// Returns the unescaped bytes of this literal, followed by a terminating NUL byte, if it is
/// a C string literal or a raw C string literal.
///
/// # Errors
///
/// Returns [`ConversionErrorKind::InvalidLiteralKind`] for any other kind of literal, and
/// [`ConversionErrorKind::FailedToUnescape`] when unescaping reports an error for which
/// `is_fatal()` is true.
#[unstable(feature = "proc_macro_value", issue = "136652")]
pub fn cstr_value(&self) -> Result<Vec<u8>, ConversionErrorKind> {
    self.0.symbol.with(|symbol| match self.0.kind {
        bridge::LitKind::CStr => {
            let mut bytes = Vec::with_capacity(symbol.len());
            let mut failure = None;

            unescape_mixed(symbol, Mode::CStr, &mut |_, unit| match unit {
                // Chars are stored as their UTF-8 encoding.
                Ok(MixedUnit::Char(ch)) => {
                    let mut utf8 = [0; 4];
                    bytes.extend_from_slice(ch.encode_utf8(&mut utf8).as_bytes());
                }
                Ok(MixedUnit::HighByte(byte)) => bytes.push(byte),
                // If several fatal errors are reported, the most recent one is kept.
                Err(err) if err.is_fatal() => {
                    failure = Some(ConversionErrorKind::FailedToUnescape(err));
                }
                // Non-fatal errors are deliberately ignored.
                Err(_) => {}
            });

            match failure {
                Some(err) => Err(err),
                None => {
                    bytes.push(0);
                    Ok(bytes)
                }
            }
        }
        bridge::LitKind::CStrRaw(_) => {
            // Raw literals are not unescaped: copy the symbol's bytes into a `Vec<u8>`
            // and append the terminating NUL byte.
            let mut bytes = String::from(symbol).into_bytes();
            bytes.push(0);
            Ok(bytes)
        }
        _ => Err(ConversionErrorKind::InvalidLiteralKind),
    })
}

/// Returns the unescaped bytes of this literal if it is a byte string literal or a raw byte
/// string literal.
///
/// # Errors
///
/// Returns [`ConversionErrorKind::InvalidLiteralKind`] for any other kind of literal, and
/// [`ConversionErrorKind::FailedToUnescape`] when unescaping reports an error for which
/// `is_fatal()` is true.
#[unstable(feature = "proc_macro_value", issue = "136652")]
pub fn byte_str_value(&self) -> Result<Vec<u8>, ConversionErrorKind> {
    self.0.symbol.with(|symbol| match self.0.kind {
        bridge::LitKind::ByteStr => {
            let mut bytes = Vec::with_capacity(symbol.len());
            let mut failure = None;

            unescape_unicode(symbol, Mode::ByteStr, &mut |_, unit| match unit {
                Ok(ch) => bytes.push(byte_from_char(ch)),
                // If several fatal errors are reported, the most recent one is kept.
                Err(err) if err.is_fatal() => {
                    failure = Some(ConversionErrorKind::FailedToUnescape(err));
                }
                // Non-fatal errors are deliberately ignored.
                Err(_) => {}
            });

            match failure {
                Some(err) => Err(err),
                None => Ok(bytes),
            }
        }
        // Raw literals are not unescaped: the symbol's bytes are the value.
        bridge::LitKind::ByteStrRaw(_) => Ok(String::from(symbol).into_bytes()),
        _ => Err(ConversionErrorKind::InvalidLiteralKind),
    })
}
}

/// Parse a single literal from its stringified representation.
Expand Down
Loading
Loading