Skip to content

Commit e898e7c

Browse files
authored
Rollup merge of rust-lang#109698 - epage:wtf, r=Amanieu
Allow limited access to `OsStr` bytes

`OsStr` has historically kept its implementation details private out of concern for locking us into a specific encoding on Windows. This is an alternative to rust-lang#95290, which proposed specifying the encoding on Windows. Instead, this only specifies that for cross-platform code, `OsStr`'s encoding is a superset of UTF-8 and defines rules for safely interacting with it. At minimum, this can greatly simplify the `os_str_bytes` crate and every arg parser that interacts with `OsStr` directly (which is most of those that support invalid UTF-8). Tracking issue: rust-lang#111544
2 parents a9251b6 + e6a35c4 commit e898e7c

File tree

14 files changed

+159
-83
lines changed

14 files changed

+159
-83
lines changed

library/std/src/ffi/mod.rs

+8
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,14 @@
127127
//! trait, which provides a [`from_wide`] method to convert a native Windows
128128
//! string (without the terminating nul character) to an [`OsString`].
129129
//!
130+
//! ## On all platforms
131+
//!
132+
//! On all platforms, [`OsStr`] consists of a sequence of bytes that is encoded as a superset of
133+
//! UTF-8; see [`OsString`] for more details on its encoding on different platforms.
134+
//!
135+
//! For limited, inexpensive conversions from and to bytes, see [`OsStr::as_os_str_bytes`] and
136+
//! [`OsStr::from_os_str_bytes_unchecked`].
137+
//!
130138
//! [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
131139
//! [Unicode code point]: https://www.unicode.org/glossary/#code_point
132140
//! [`env::set_var()`]: crate::env::set_var "env::set_var"

library/std/src/ffi/os_str.rs

+69-13
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,51 @@ impl OsStr {
667667
s.as_ref()
668668
}
669669

670+
/// Converts a slice of bytes to an OS string slice without checking that the string contains
671+
/// valid `OsStr`-encoded data.
672+
///
673+
/// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8.
674+
/// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit
675+
/// ASCII.
676+
///
677+
/// See the [module's toplevel documentation about conversions][conversions] for safe,
678+
/// cross-platform [conversions] from/to native representations.
679+
///
680+
/// # Safety
681+
///
682+
/// As the encoding is unspecified, callers must pass in bytes that originated as a mixture of
683+
/// validated UTF-8 and bytes from [`OsStr::as_os_str_bytes`] from within the same Rust version
684+
/// built for the same target platform. For example, reconstructing an `OsStr` from bytes sent
685+
/// over the network or stored in a file will likely violate these safety rules.
686+
///
687+
/// Due to the encoding being self-synchronizing, the bytes from [`OsStr::as_os_str_bytes`] can be
688+
/// split either immediately before or immediately after any valid non-empty UTF-8 substring.
689+
///
690+
/// # Example
691+
///
692+
/// ```
693+
/// #![feature(os_str_bytes)]
694+
///
695+
/// use std::ffi::OsStr;
696+
///
697+
/// let os_str = OsStr::new("Mary had a little lamb");
698+
/// let bytes = os_str.as_os_str_bytes();
699+
/// let words = bytes.split(|b| *b == b' ');
700+
/// let words: Vec<&OsStr> = words.map(|word| {
701+
/// // SAFETY:
702+
/// // - Each `word` only contains content that originated from `OsStr::as_os_str_bytes`
703+
/// // - Only split with ASCII whitespace which is a non-empty UTF-8 substring
704+
/// unsafe { OsStr::from_os_str_bytes_unchecked(word) }
705+
/// }).collect();
706+
/// ```
707+
///
708+
/// [conversions]: super#conversions
709+
#[inline]
710+
#[unstable(feature = "os_str_bytes", issue = "111544")]
711+
pub unsafe fn from_os_str_bytes_unchecked(bytes: &[u8]) -> &Self {
712+
Self::from_inner(Slice::from_os_str_bytes_unchecked(bytes))
713+
}
714+
670715
#[inline]
671716
fn from_inner(inner: &Slice) -> &OsStr {
672717
// SAFETY: OsStr is just a wrapper of Slice,
@@ -837,13 +882,24 @@ impl OsStr {
837882
OsString { inner: Buf::from_box(boxed) }
838883
}
839884

840-
/// Gets the underlying byte representation.
885+
/// Converts an OS string slice to a byte slice. To convert the byte slice back into an OS
886+
/// string slice, use the [`OsStr::from_os_str_bytes_unchecked`] function.
887+
///
888+
/// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8.
889+
/// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit
890+
/// ASCII.
891+
///
892+
/// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should
893+
/// be treated as opaque and only comparable within the same Rust version built for the same
894+
/// target platform. For example, sending the slice over the network or storing it in a file
895+
/// will likely result in incompatible byte slices. See [`OsString`] for more encoding details
896+
/// and [`std::ffi`] for platform-specific, specified conversions.
841897
///
842-
/// Note: it is *crucial* that this API is not externally public, to avoid
843-
/// revealing the internal, platform-specific encodings.
898+
/// [`std::ffi`]: crate::ffi
844899
#[inline]
845-
pub(crate) fn bytes(&self) -> &[u8] {
846-
unsafe { &*(&self.inner as *const _ as *const [u8]) }
900+
#[unstable(feature = "os_str_bytes", issue = "111544")]
901+
pub fn as_os_str_bytes(&self) -> &[u8] {
902+
self.inner.as_os_str_bytes()
847903
}
848904

849905
/// Converts this string to its ASCII lower case equivalent in-place.
@@ -1131,7 +1187,7 @@ impl Default for &OsStr {
11311187
impl PartialEq for OsStr {
11321188
#[inline]
11331189
fn eq(&self, other: &OsStr) -> bool {
1134-
self.bytes().eq(other.bytes())
1190+
self.as_os_str_bytes().eq(other.as_os_str_bytes())
11351191
}
11361192
}
11371193

@@ -1158,23 +1214,23 @@ impl Eq for OsStr {}
11581214
impl PartialOrd for OsStr {
11591215
#[inline]
11601216
fn partial_cmp(&self, other: &OsStr) -> Option<cmp::Ordering> {
1161-
self.bytes().partial_cmp(other.bytes())
1217+
self.as_os_str_bytes().partial_cmp(other.as_os_str_bytes())
11621218
}
11631219
#[inline]
11641220
fn lt(&self, other: &OsStr) -> bool {
1165-
self.bytes().lt(other.bytes())
1221+
self.as_os_str_bytes().lt(other.as_os_str_bytes())
11661222
}
11671223
#[inline]
11681224
fn le(&self, other: &OsStr) -> bool {
1169-
self.bytes().le(other.bytes())
1225+
self.as_os_str_bytes().le(other.as_os_str_bytes())
11701226
}
11711227
#[inline]
11721228
fn gt(&self, other: &OsStr) -> bool {
1173-
self.bytes().gt(other.bytes())
1229+
self.as_os_str_bytes().gt(other.as_os_str_bytes())
11741230
}
11751231
#[inline]
11761232
fn ge(&self, other: &OsStr) -> bool {
1177-
self.bytes().ge(other.bytes())
1233+
self.as_os_str_bytes().ge(other.as_os_str_bytes())
11781234
}
11791235
}
11801236

@@ -1193,7 +1249,7 @@ impl PartialOrd<str> for OsStr {
11931249
impl Ord for OsStr {
11941250
#[inline]
11951251
fn cmp(&self, other: &OsStr) -> cmp::Ordering {
1196-
self.bytes().cmp(other.bytes())
1252+
self.as_os_str_bytes().cmp(other.as_os_str_bytes())
11971253
}
11981254
}
11991255

@@ -1243,7 +1299,7 @@ impl_cmp!(Cow<'a, OsStr>, OsString);
12431299
impl Hash for OsStr {
12441300
#[inline]
12451301
fn hash<H: Hasher>(&self, state: &mut H) {
1246-
self.bytes().hash(state)
1302+
self.as_os_str_bytes().hash(state)
12471303
}
12481304
}
12491305

library/std/src/path.rs

+24-28
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ impl<'a> Prefix<'a> {
193193
fn len(&self) -> usize {
194194
use self::Prefix::*;
195195
fn os_str_len(s: &OsStr) -> usize {
196-
s.bytes().len()
196+
s.as_os_str_bytes().len()
197197
}
198198
match *self {
199199
Verbatim(x) => 4 + os_str_len(x),
@@ -299,20 +299,6 @@ where
299299
}
300300
}
301301

302-
unsafe fn u8_slice_as_os_str(s: &[u8]) -> &OsStr {
303-
// SAFETY: See note at the top of this module to understand why this and
304-
// `OsStr::bytes` are used:
305-
//
306-
// These casts are safe as OsStr is internally a wrapper around [u8] on all
307-
// platforms.
308-
//
309-
// Note that currently this relies on the special knowledge that std has;
310-
// these types are single-element structs but are not marked
311-
// repr(transparent) or repr(C) which would make these casts not allowable
312-
// outside std.
313-
unsafe { &*(s as *const [u8] as *const OsStr) }
314-
}
315-
316302
// Detect scheme on Redox
317303
fn has_redox_scheme(s: &[u8]) -> bool {
318304
cfg!(target_os = "redox") && s.contains(&b':')
@@ -330,26 +316,31 @@ fn has_physical_root(s: &[u8], prefix: Option<Prefix<'_>>) -> bool {
330316

331317
// basic workhorse for splitting stem and extension
332318
fn rsplit_file_at_dot(file: &OsStr) -> (Option<&OsStr>, Option<&OsStr>) {
333-
if file.bytes() == b".." {
319+
if file.as_os_str_bytes() == b".." {
334320
return (Some(file), None);
335321
}
336322

337323
// The unsafety here stems from converting between &OsStr and &[u8]
338324
// and back. This is safe to do because (1) we only look at ASCII
339325
// contents of the encoding and (2) new &OsStr values are produced
340326
// only from ASCII-bounded slices of existing &OsStr values.
341-
let mut iter = file.bytes().rsplitn(2, |b| *b == b'.');
327+
let mut iter = file.as_os_str_bytes().rsplitn(2, |b| *b == b'.');
342328
let after = iter.next();
343329
let before = iter.next();
344330
if before == Some(b"") {
345331
(Some(file), None)
346332
} else {
347-
unsafe { (before.map(|s| u8_slice_as_os_str(s)), after.map(|s| u8_slice_as_os_str(s))) }
333+
unsafe {
334+
(
335+
before.map(|s| OsStr::from_os_str_bytes_unchecked(s)),
336+
after.map(|s| OsStr::from_os_str_bytes_unchecked(s)),
337+
)
338+
}
348339
}
349340
}
350341

351342
fn split_file_at_dot(file: &OsStr) -> (&OsStr, Option<&OsStr>) {
352-
let slice = file.bytes();
343+
let slice = file.as_os_str_bytes();
353344
if slice == b".." {
354345
return (file, None);
355346
}
@@ -364,7 +355,12 @@ fn split_file_at_dot(file: &OsStr) -> (&OsStr, Option<&OsStr>) {
364355
};
365356
let before = &slice[..i];
366357
let after = &slice[i + 1..];
367-
unsafe { (u8_slice_as_os_str(before), Some(u8_slice_as_os_str(after))) }
358+
unsafe {
359+
(
360+
OsStr::from_os_str_bytes_unchecked(before),
361+
Some(OsStr::from_os_str_bytes_unchecked(after)),
362+
)
363+
}
368364
}
369365

370366
////////////////////////////////////////////////////////////////////////////////
@@ -743,7 +739,7 @@ impl<'a> Components<'a> {
743739
// separately via `include_cur_dir`
744740
b".." => Some(Component::ParentDir),
745741
b"" => None,
746-
_ => Some(Component::Normal(unsafe { u8_slice_as_os_str(comp) })),
742+
_ => Some(Component::Normal(unsafe { OsStr::from_os_str_bytes_unchecked(comp) })),
747743
}
748744
}
749745

@@ -900,7 +896,7 @@ impl<'a> Iterator for Components<'a> {
900896
let raw = &self.path[..self.prefix_len()];
901897
self.path = &self.path[self.prefix_len()..];
902898
return Some(Component::Prefix(PrefixComponent {
903-
raw: unsafe { u8_slice_as_os_str(raw) },
899+
raw: unsafe { OsStr::from_os_str_bytes_unchecked(raw) },
904900
parsed: self.prefix.unwrap(),
905901
}));
906902
}
@@ -972,7 +968,7 @@ impl<'a> DoubleEndedIterator for Components<'a> {
972968
State::Prefix if self.prefix_len() > 0 => {
973969
self.back = State::Done;
974970
return Some(Component::Prefix(PrefixComponent {
975-
raw: unsafe { u8_slice_as_os_str(self.path) },
971+
raw: unsafe { OsStr::from_os_str_bytes_unchecked(self.path) },
976972
parsed: self.prefix.unwrap(),
977973
}));
978974
}
@@ -1481,17 +1477,17 @@ impl PathBuf {
14811477
fn _set_extension(&mut self, extension: &OsStr) -> bool {
14821478
let file_stem = match self.file_stem() {
14831479
None => return false,
1484-
Some(f) => f.bytes(),
1480+
Some(f) => f.as_os_str_bytes(),
14851481
};
14861482

14871483
// truncate until right after the file stem
14881484
let end_file_stem = file_stem[file_stem.len()..].as_ptr().addr();
1489-
let start = self.inner.bytes().as_ptr().addr();
1485+
let start = self.inner.as_os_str_bytes().as_ptr().addr();
14901486
let v = self.as_mut_vec();
14911487
v.truncate(end_file_stem.wrapping_sub(start));
14921488

14931489
// add the new extension, if any
1494-
let new = extension.bytes();
1490+
let new = extension.as_os_str_bytes();
14951491
if !new.is_empty() {
14961492
v.reserve_exact(new.len() + 1);
14971493
v.push(b'.');
@@ -2011,11 +2007,11 @@ impl Path {
20112007
// The following (private!) function allows construction of a path from a u8
20122008
// slice, which is only safe when it is known to follow the OsStr encoding.
20132009
unsafe fn from_u8_slice(s: &[u8]) -> &Path {
2014-
unsafe { Path::new(u8_slice_as_os_str(s)) }
2010+
unsafe { Path::new(OsStr::from_os_str_bytes_unchecked(s)) }
20152011
}
20162012
// The following (private!) function reveals the byte encoding used for OsStr.
20172013
fn as_u8_slice(&self) -> &[u8] {
2018-
self.inner.bytes()
2014+
self.inner.as_os_str_bytes()
20192015
}
20202016

20212017
/// Directly wraps a string slice as a `Path` slice.

library/std/src/sys/common/small_c_string.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ pub fn run_path_with_cstr<T, F>(path: &Path, f: F) -> io::Result<T>
1919
where
2020
F: FnOnce(&CStr) -> io::Result<T>,
2121
{
22-
run_with_cstr(path.as_os_str().bytes(), f)
22+
run_with_cstr(path.as_os_str().as_os_str_bytes(), f)
2323
}
2424

2525
#[inline]

library/std/src/sys/common/tests.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use core::iter::repeat;
88
fn stack_allocation_works() {
99
let path = Path::new("abc");
1010
let result = run_path_with_cstr(path, |p| {
11-
assert_eq!(p, &*CString::new(path.as_os_str().bytes()).unwrap());
11+
assert_eq!(p, &*CString::new(path.as_os_str().as_os_str_bytes()).unwrap());
1212
Ok(42)
1313
});
1414
assert_eq!(result.unwrap(), 42);
@@ -25,7 +25,7 @@ fn heap_allocation_works() {
2525
let path = repeat("a").take(384).collect::<String>();
2626
let path = Path::new(&path);
2727
let result = run_path_with_cstr(path, |p| {
28-
assert_eq!(p, &*CString::new(path.as_os_str().bytes()).unwrap());
28+
assert_eq!(p, &*CString::new(path.as_os_str().as_os_str_bytes()).unwrap());
2929
Ok(42)
3030
});
3131
assert_eq!(result.unwrap(), 42);

library/std/src/sys/unix/os_str.rs

+7-2
Original file line numberDiff line numberDiff line change
@@ -193,13 +193,18 @@ impl Buf {
193193

194194
impl Slice {
195195
#[inline]
196-
fn from_u8_slice(s: &[u8]) -> &Slice {
196+
pub fn as_os_str_bytes(&self) -> &[u8] {
197+
&self.inner
198+
}
199+
200+
#[inline]
201+
pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice {
197202
unsafe { mem::transmute(s) }
198203
}
199204

200205
#[inline]
201206
pub fn from_str(s: &str) -> &Slice {
202-
Slice::from_u8_slice(s.as_bytes())
207+
unsafe { Slice::from_os_str_bytes_unchecked(s.as_bytes()) }
203208
}
204209

205210
pub fn to_str(&self) -> Option<&str> {

library/std/src/sys/unix/os_str/tests.rs

+4-5
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use super::*;
22

33
#[test]
44
fn slice_debug_output() {
5-
let input = Slice::from_u8_slice(b"\xF0hello,\tworld");
5+
let input = unsafe { Slice::from_os_str_bytes_unchecked(b"\xF0hello,\tworld") };
66
let expected = r#""\xF0hello,\tworld""#;
77
let output = format!("{input:?}");
88

@@ -11,8 +11,7 @@ fn slice_debug_output() {
1111

1212
#[test]
1313
fn display() {
14-
assert_eq!(
15-
"Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye",
16-
Slice::from_u8_slice(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string(),
17-
);
14+
assert_eq!("Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye", unsafe {
15+
Slice::from_os_str_bytes_unchecked(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string()
16+
},);
1817
}

library/std/src/sys/unix/path.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ pub(crate) fn absolute(path: &Path) -> io::Result<PathBuf> {
3030

3131
// Get the components, skipping the redundant leading "." component if it exists.
3232
let mut components = path.strip_prefix(".").unwrap_or(path).components();
33-
let path_os = path.as_os_str().bytes();
33+
let path_os = path.as_os_str().as_os_str_bytes();
3434

3535
let mut normalized = if path.is_absolute() {
3636
// "If a pathname begins with two successive <slash> characters, the

library/std/src/sys/unix/process/process_common.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -164,9 +164,9 @@ pub enum ProgramKind {
164164

165165
impl ProgramKind {
166166
fn new(program: &OsStr) -> Self {
167-
if program.bytes().starts_with(b"/") {
167+
if program.as_os_str_bytes().starts_with(b"/") {
168168
Self::Absolute
169-
} else if program.bytes().contains(&b'/') {
169+
} else if program.as_os_str_bytes().contains(&b'/') {
170170
// If the program has more than one component in it, it is a relative path.
171171
Self::Relative
172172
} else {

0 commit comments

Comments
 (0)