Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add substring API for OsStr #118484

Merged
merged 1 commit into from
Dec 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 80 additions & 2 deletions library/std/src/ffi/os_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ use crate::cmp;
use crate::collections::TryReserveError;
use crate::fmt;
use crate::hash::{Hash, Hasher};
use crate::ops;
use crate::ops::{self, Range};
use crate::rc::Rc;
use crate::str::FromStr;
use crate::slice;
use crate::str::{from_utf8 as str_from_utf8, FromStr};
use crate::sync::Arc;

use crate::sys::os_str::{Buf, Slice};
Expand Down Expand Up @@ -963,6 +964,83 @@ impl OsStr {
self.inner.as_encoded_bytes()
}

/// Takes a substring based on a range that corresponds to the return value of
/// [`OsStr::as_encoded_bytes`].
///
/// The range's start and end must lie on valid `OsStr` boundaries.
/// A valid `OsStr` boundary is one of:
/// - The start of the string
/// - The end of the string
/// - Immediately before a valid non-empty UTF-8 substring
/// - Immediately after a valid non-empty UTF-8 substring
///
/// # Panics
///
/// Panics if `range` does not lie on valid `OsStr` boundaries or if it
/// exceeds the end of the string.
///
/// # Example
///
/// ```
/// #![feature(os_str_slice)]
///
/// use std::ffi::OsStr;
///
/// let os_str = OsStr::new("foo=bar");
/// let bytes = os_str.as_encoded_bytes();
/// if let Some(index) = bytes.iter().position(|b| *b == b'=') {
/// let key = os_str.slice_encoded_bytes(..index);
/// let value = os_str.slice_encoded_bytes(index + 1..);
/// assert_eq!(key, "foo");
/// assert_eq!(value, "bar");
/// }
/// ```
#[unstable(feature = "os_str_slice", issue = "118485")]
pub fn slice_encoded_bytes<R: ops::RangeBounds<usize>>(&self, range: R) -> &Self {
#[track_caller]
fn check_valid_boundary(bytes: &[u8], index: usize) {
if index == 0 || index == bytes.len() {
return;
}

// Fast path
if bytes[index - 1].is_ascii() || bytes[index].is_ascii() {
return;
}

let (before, after) = bytes.split_at(index);

// UTF-8 takes at most 4 bytes per codepoint, so we don't
// need to check more than that.
let after = after.get(..4).unwrap_or(after);
match str_from_utf8(after) {
Ok(_) => return,
Err(err) if err.valid_up_to() != 0 => return,
Err(_) => (),
}

for len in 2..=4.min(index) {
let before = &before[index - len..];
if str_from_utf8(before).is_ok() {
return;
}
}

panic!("byte index {index} is not an OsStr boundary");
}

let encoded_bytes = self.as_encoded_bytes();
let Range { start, end } = slice::range(range, ..encoded_bytes.len());
check_valid_boundary(encoded_bytes, start);
check_valid_boundary(encoded_bytes, end);

// SAFETY: `slice::range` ensures that `start` and `end` are valid
let slice = unsafe { encoded_bytes.get_unchecked(start..end) };

// SAFETY: `slice` comes from `self` and we validated the boundaries
unsafe { Self::from_encoded_bytes_unchecked(slice) }
}

/// Converts this string to its ASCII lower case equivalent in-place.
///
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
Expand Down
50 changes: 50 additions & 0 deletions library/std/src/ffi/os_str/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,3 +177,53 @@ fn into_rc() {
assert_eq!(&*rc2, os_str);
assert_eq!(&*arc2, os_str);
}

#[test]
fn slice_encoded_bytes() {
let os_str = OsStr::new("123θგ🦀");
// ASCII
let digits = os_str.slice_encoded_bytes(..3);
assert_eq!(digits, "123");
let three = os_str.slice_encoded_bytes(2..3);
assert_eq!(three, "3");
// 2-byte UTF-8
let theta = os_str.slice_encoded_bytes(3..5);
assert_eq!(theta, "θ");
// 3-byte UTF-8
let gani = os_str.slice_encoded_bytes(5..8);
assert_eq!(gani, "გ");
// 4-byte UTF-8
let crab = os_str.slice_encoded_bytes(8..);
assert_eq!(crab, "🦀");
}

#[test]
#[should_panic(expected = "byte index 2 is not an OsStr boundary")]
fn slice_mid_char() {
let crab = OsStr::new("🦀");
let _ = crab.slice_encoded_bytes(..2);
}

#[cfg(windows)]
#[test]
#[should_panic(expected = "byte index 3 is not an OsStr boundary")]
fn slice_between_surrogates() {
use crate::os::windows::ffi::OsStringExt;

let os_string = OsString::from_wide(&[0xD800, 0xD800]);
assert_eq!(os_string.as_encoded_bytes(), &[0xED, 0xA0, 0x80, 0xED, 0xA0, 0x80]);
let _ = os_string.slice_encoded_bytes(..3);
}

#[cfg(windows)]
#[test]
fn slice_surrogate_edge() {
use crate::os::windows::ffi::OsStringExt;

let os_string = OsString::from_wide(&[0xD800]);
let mut with_crab = os_string.clone();
with_crab.push("🦀");

assert_eq!(with_crab.slice_encoded_bytes(..3), os_string);
assert_eq!(with_crab.slice_encoded_bytes(3..), "🦀");
}
1 change: 1 addition & 0 deletions library/std/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,7 @@
#![feature(round_ties_even)]
#![feature(slice_internals)]
#![feature(slice_ptr_get)]
#![feature(slice_range)]
#![feature(std_internals)]
#![feature(str_internals)]
#![feature(strict_provenance)]
Expand Down
Loading