Skip to content

Commit d5fab33

Browse files
committed
Auto merge of rust-lang#118484 - blyxxyz:os-str-slice, r=BurntSushi
Add substring API for `OsStr`

This adds a method for taking a substring of an `OsStr`, which in combination with [`OsStr::as_encoded_bytes()`](https://doc.rust-lang.org/std/ffi/struct.OsStr.html#method.as_encoded_bytes) makes it possible to implement most string operations in safe code.

API:

```rust
impl OsStr {
    pub fn slice_encoded_bytes<R: ops::RangeBounds<usize>>(&self, range: R) -> &Self;
}
```

Motivation, examples and research at rust-lang/libs-team#306.

Tracking issue: rust-lang#118485

cc `@epage`
r? libs-api
2 parents 3f1e30a + 729851e commit d5fab33

File tree

3 files changed

+131
-2
lines changed

3 files changed

+131
-2
lines changed

library/std/src/ffi/os_str.rs

+80-2
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@ use crate::cmp;
66
use crate::collections::TryReserveError;
77
use crate::fmt;
88
use crate::hash::{Hash, Hasher};
9-
use crate::ops;
9+
use crate::ops::{self, Range};
1010
use crate::rc::Rc;
11-
use crate::str::FromStr;
11+
use crate::slice;
12+
use crate::str::{from_utf8 as str_from_utf8, FromStr};
1213
use crate::sync::Arc;
1314

1415
use crate::sys::os_str::{Buf, Slice};
@@ -963,6 +964,83 @@ impl OsStr {
963964
self.inner.as_encoded_bytes()
964965
}
965966

967+
/// Takes a substring based on a range that corresponds to the return value of
968+
/// [`OsStr::as_encoded_bytes`].
969+
///
970+
/// The range's start and end must lie on valid `OsStr` boundaries.
971+
/// A valid `OsStr` boundary is one of:
972+
/// - The start of the string
973+
/// - The end of the string
974+
/// - Immediately before a valid non-empty UTF-8 substring
975+
/// - Immediately after a valid non-empty UTF-8 substring
976+
///
977+
/// # Panics
978+
///
979+
/// Panics if `range` does not lie on valid `OsStr` boundaries or if it
980+
/// exceeds the end of the string.
981+
///
982+
/// # Example
983+
///
984+
/// ```
985+
/// #![feature(os_str_slice)]
986+
///
987+
/// use std::ffi::OsStr;
988+
///
989+
/// let os_str = OsStr::new("foo=bar");
990+
/// let bytes = os_str.as_encoded_bytes();
991+
/// if let Some(index) = bytes.iter().position(|b| *b == b'=') {
992+
/// let key = os_str.slice_encoded_bytes(..index);
993+
/// let value = os_str.slice_encoded_bytes(index + 1..);
994+
/// assert_eq!(key, "foo");
995+
/// assert_eq!(value, "bar");
996+
/// }
997+
/// ```
998+
#[unstable(feature = "os_str_slice", issue = "118485")]
999+
pub fn slice_encoded_bytes<R: ops::RangeBounds<usize>>(&self, range: R) -> &Self {
1000+
#[track_caller]
1001+
fn check_valid_boundary(bytes: &[u8], index: usize) {
1002+
if index == 0 || index == bytes.len() {
1003+
return;
1004+
}
1005+
1006+
// Fast path
1007+
if bytes[index - 1].is_ascii() || bytes[index].is_ascii() {
1008+
return;
1009+
}
1010+
1011+
let (before, after) = bytes.split_at(index);
1012+
1013+
// UTF-8 takes at most 4 bytes per codepoint, so we don't
1014+
// need to check more than that.
1015+
let after = after.get(..4).unwrap_or(after);
1016+
match str_from_utf8(after) {
1017+
Ok(_) => return,
1018+
Err(err) if err.valid_up_to() != 0 => return,
1019+
Err(_) => (),
1020+
}
1021+
1022+
for len in 2..=4.min(index) {
1023+
let before = &before[index - len..];
1024+
if str_from_utf8(before).is_ok() {
1025+
return;
1026+
}
1027+
}
1028+
1029+
panic!("byte index {index} is not an OsStr boundary");
1030+
}
1031+
1032+
let encoded_bytes = self.as_encoded_bytes();
1033+
let Range { start, end } = slice::range(range, ..encoded_bytes.len());
1034+
check_valid_boundary(encoded_bytes, start);
1035+
check_valid_boundary(encoded_bytes, end);
1036+
1037+
// SAFETY: `slice::range` ensures that `start` and `end` are valid
1038+
let slice = unsafe { encoded_bytes.get_unchecked(start..end) };
1039+
1040+
// SAFETY: `slice` comes from `self` and we validated the boundaries
1041+
unsafe { Self::from_encoded_bytes_unchecked(slice) }
1042+
}
1043+
9661044
/// Converts this string to its ASCII lower case equivalent in-place.
9671045
///
9681046
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',

library/std/src/ffi/os_str/tests.rs

+50
Original file line numberDiff line numberDiff line change
@@ -177,3 +177,53 @@ fn into_rc() {
177177
assert_eq!(&*rc2, os_str);
178178
assert_eq!(&*arc2, os_str);
179179
}
180+
181+
#[test]
fn slice_encoded_bytes() {
    // An ASCII prefix followed by one character of each multi-byte UTF-8
    // width: "123" is bytes 0..3, 'θ' 3..5, 'გ' 5..8 and '🦀' 8 onwards.
    let sample = OsStr::new("123θგ🦀");

    // ASCII
    assert_eq!(sample.slice_encoded_bytes(..3), "123");
    assert_eq!(sample.slice_encoded_bytes(2..3), "3");
    // 2-byte UTF-8
    assert_eq!(sample.slice_encoded_bytes(3..5), "θ");
    // 3-byte UTF-8
    assert_eq!(sample.slice_encoded_bytes(5..8), "გ");
    // 4-byte UTF-8
    assert_eq!(sample.slice_encoded_bytes(8..), "🦀");
}
199+
200+
#[test]
201+
#[should_panic(expected = "byte index 2 is not an OsStr boundary")]
202+
fn slice_mid_char() {
203+
let crab = OsStr::new("🦀");
204+
let _ = crab.slice_encoded_bytes(..2);
205+
}
206+
207+
#[cfg(windows)]
#[test]
#[should_panic(expected = "byte index 3 is not an OsStr boundary")]
fn slice_between_surrogates() {
    use crate::os::windows::ffi::OsStringExt;

    // Two identical surrogate code units; the assertion below pins their
    // encoded form at three bytes each.
    let surrogates = OsString::from_wide(&[0xD800, 0xD800]);
    assert_eq!(surrogates.as_encoded_bytes(), &[0xED, 0xA0, 0x80, 0xED, 0xA0, 0x80]);

    // Index 3 sits between the two encoded surrogates; neither side is
    // valid UTF-8, so this must panic.
    let _ = surrogates.slice_encoded_bytes(..3);
}
217+
218+
#[cfg(windows)]
#[test]
fn slice_surrogate_edge() {
    use crate::os::windows::ffi::OsStringExt;

    // A single surrogate code unit followed by a valid UTF-8 character.
    let surrogate = OsString::from_wide(&[0xD800]);
    let mut combined = surrogate.clone();
    combined.push("🦀");

    // Index 3 is a boundary because valid UTF-8 starts right after it, so
    // splitting there is allowed in both directions.
    assert_eq!(combined.slice_encoded_bytes(..3), surrogate);
    assert_eq!(combined.slice_encoded_bytes(3..), "🦀");
}

library/std/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,7 @@
341341
#![feature(round_ties_even)]
342342
#![feature(slice_internals)]
343343
#![feature(slice_ptr_get)]
344+
#![feature(slice_range)]
344345
#![feature(std_internals)]
345346
#![feature(str_internals)]
346347
#![feature(strict_provenance)]

0 commit comments

Comments
 (0)