Skip to content

Commit

Permalink
Add OsStr::to_str_split() and OsString::into_string_split().
Browse files Browse the repository at this point in the history
  • Loading branch information
jmillikin committed May 1, 2023
1 parent 9ecda8d commit 0f93dae
Show file tree
Hide file tree
Showing 6 changed files with 238 additions and 0 deletions.
56 changes: 56 additions & 0 deletions library/std/src/ffi/os_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,34 @@ impl OsString {
self.inner.into_string().map_err(|buf| OsString { inner: buf })
}

/// Consumes the `OsString`, splitting it into a Unicode prefix and the
/// remaining non-Unicode suffix.
///
/// The returned `String` is the longest prefix of the `OsString` that is
/// valid Unicode. The returned `OsString` holds everything after that
/// prefix, and is empty when the whole value is valid Unicode.
/// Concatenating the two parts yields the original value.
///
/// # Examples
///
/// ```
/// #![feature(osstr_str_prefix_ops)]
///
/// use std::ffi::OsString;
///
/// let os_string = OsString::from("foo");
/// let (prefix, suffix) = os_string.clone().into_string_split();
///
/// // The input is entirely Unicode, so nothing is left for the suffix.
/// assert_eq!(prefix, "foo");
/// assert!(suffix.is_empty());
///
/// let mut rejoined = OsString::from(prefix);
/// rejoined.push(suffix);
/// assert_eq!(rejoined, os_string);
/// ```
#[unstable(feature = "osstr_str_prefix_ops", issue = "none")]
#[must_use]
#[inline]
pub fn into_string_split(self) -> (String, OsString) {
    // The platform buffer performs the split; only the suffix needs to be
    // wrapped back into the public type.
    let (unicode, rest) = self.inner.into_string_split();
    (unicode, OsString { inner: rest })
}

/// Extends the string with the given <code>&[OsStr]</code> slice.
///
/// # Examples
Expand Down Expand Up @@ -703,6 +731,34 @@ impl OsStr {
self.inner.to_str()
}

/// Splits the `OsStr` into a Unicode prefix and non-Unicode suffix.
///
/// The returned `str` is the longest prefix of the `OsStr` that
/// contained valid Unicode. The returned `OsStr` is the rest of the
/// original value.
///
/// # Examples
///
/// ```
/// #![feature(osstr_str_prefix_ops)]
///
/// use std::ffi::{OsStr, OsString};
///
/// let os_str = OsStr::new("foo");
/// let (prefix, suffix) = os_str.to_str_split();
///
/// let mut rejoined = OsString::from(prefix);
/// rejoined.push(suffix);
/// assert_eq!(rejoined, os_str);
/// ```
#[unstable(feature = "osstr_str_prefix_ops", issue = "none")]
#[must_use]
#[inline]
pub fn to_str_split(&self) -> (&str, &OsStr) {
let (prefix, suffix) = self.inner.to_str_split();
(prefix, Self::from_inner(suffix))
}

/// Converts an `OsStr` to a <code>[Cow]<[str]></code>.
///
/// Any non-Unicode sequences are replaced with
Expand Down
36 changes: 36 additions & 0 deletions library/std/src/sys/unix/os_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,27 @@ impl Buf {
String::from_utf8(self.inner).map_err(|p| Buf { inner: p.into_bytes() })
}

/// Consumes the buffer, splitting it into its longest valid UTF-8 prefix and
/// the bytes that follow it.
///
/// When the whole buffer is UTF-8, or when it does not start with any valid
/// UTF-8, the existing allocation is handed back unchanged in the matching
/// half of the pair and the other half is empty. Otherwise the bytes after
/// the prefix are moved into a new vector.
pub fn into_string_split(self) -> (String, Buf) {
    // Validate by value so the success path needs no `unsafe`; on failure
    // the error hands back both the valid length and the original bytes.
    let err = match String::from_utf8(self.inner) {
        Ok(prefix) => return (prefix, Buf { inner: Vec::new() }),
        Err(err) => err,
    };
    let utf8_len = err.utf8_error().valid_up_to();
    let mut utf8_bytes = err.into_bytes();
    if utf8_len == 0 {
        // Nothing to split off: avoid allocating a second vector.
        return (String::new(), Buf { inner: utf8_bytes });
    }
    let rem_bytes = utf8_bytes.split_off(utf8_len);
    // SAFETY: `Utf8Error::valid_up_to()` returns an index up to which
    // valid UTF-8 has been verified, and `utf8_bytes` was truncated to
    // exactly that length by `split_off`.
    let prefix = unsafe { String::from_utf8_unchecked(utf8_bytes) };
    (prefix, Buf { inner: rem_bytes })
}

pub fn push_slice(&mut self, s: &Slice) {
self.inner.extend_from_slice(&s.inner)
}
Expand Down Expand Up @@ -205,6 +226,21 @@ impl Slice {
str::from_utf8(&self.inner).ok()
}

/// Borrows the slice as its longest valid UTF-8 prefix and the bytes that
/// follow it.
///
/// The suffix is empty when the whole slice is valid UTF-8; the prefix is
/// empty when the slice does not start with valid UTF-8.
pub fn to_str_split(&self) -> (&str, &Slice) {
    match str::from_utf8(&self.inner) {
        // Everything is UTF-8, so there is no remainder.
        Ok(whole) => (whole, Slice::from_u8_slice(b"")),
        Err(err) => match err.valid_up_to() {
            // No valid prefix at all: the remainder is the slice itself.
            0 => ("", self),
            valid_len => {
                let (head, tail) = self.inner.split_at(valid_len);
                // SAFETY: `Utf8Error::valid_up_to()` returns an index up to
                // which valid UTF-8 has been verified, and `head` ends
                // exactly at that index.
                let head = unsafe { str::from_utf8_unchecked(head) };
                (head, Slice::from_u8_slice(tail))
            }
        },
    }
}

pub fn to_string_lossy(&self) -> Cow<'_, str> {
String::from_utf8_lossy(&self.inner)
}
Expand Down
34 changes: 34 additions & 0 deletions library/std/src/sys/unix/os_str/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,37 @@ fn display() {
Slice::from_u8_slice(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string(),
);
}

#[test]
fn buf_into_string_split() {
    // Entirely valid UTF-8: everything ends up in the prefix.
    let mut string = Buf::from_string(String::from("héllô wørld"));
    {
        let (prefix, suffix) = string.clone().into_string_split();
        assert_eq!(prefix, String::from("héllô wørld"));
        assert_eq!(suffix.into_inner(), Vec::new());
    }

    // Valid UTF-8 followed by an invalid byte: split at the invalid byte.
    string.push_slice(Slice::from_u8_slice(b"\xFF"));
    {
        let (prefix, suffix) = string.clone().into_string_split();
        assert_eq!(prefix, String::from("héllô wørld"));
        assert_eq!(suffix.into_inner(), vec![0xFF]);
    }

    // Invalid byte first: the prefix is empty and the suffix keeps every
    // byte, including the valid text that follows the invalid one.
    let mut invalid = Buf::from_string(String::new());
    invalid.push_slice(Slice::from_u8_slice(b"\xFFabc"));
    {
        let (prefix, suffix) = invalid.into_string_split();
        assert_eq!(prefix, String::new());
        assert_eq!(suffix.into_inner(), vec![0xFF, b'a', b'b', b'c']);
    }
}

#[test]
fn slice_to_str_split() {
    // Entirely valid UTF-8: everything ends up in the prefix.
    let mut string = Buf::from_string(String::from("héllô wørld"));
    {
        let (prefix, suffix) = string.as_slice().to_str_split();
        assert_eq!(prefix, "héllô wørld");
        assert_eq!(&suffix.inner, b"");
    }

    // Valid UTF-8 followed by an invalid byte: split at the invalid byte.
    string.push_slice(Slice::from_u8_slice(b"\xFF"));
    {
        let (prefix, suffix) = string.as_slice().to_str_split();
        assert_eq!(prefix, "héllô wørld");
        assert_eq!(&suffix.inner, b"\xFF");
    }

    // Invalid byte first: the prefix is empty and the suffix keeps every
    // byte, including the valid text that follows the invalid one.
    {
        let (prefix, suffix) = Slice::from_u8_slice(b"\xFFabc").to_str_split();
        assert_eq!(prefix, "");
        assert_eq!(&suffix.inner, b"\xFFabc");
    }
}
10 changes: 10 additions & 0 deletions library/std/src/sys/windows/os_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,11 @@ impl Buf {
self.inner.into_string().map_err(|buf| Buf { inner: buf })
}

/// Consumes the buffer, splitting it into a `String` prefix and a `Buf`
/// holding the remainder.
///
/// The split itself is performed by the inner buffer's
/// `into_string_split`; this only re-wraps the remainder.
pub fn into_string_split(self) -> (String, Buf) {
    let (unicode, rest) = self.inner.into_string_split();
    (unicode, Buf { inner: rest })
}

pub fn push_slice(&mut self, s: &Slice) {
self.inner.push_wtf8(&s.inner)
}
Expand Down Expand Up @@ -159,6 +164,11 @@ impl Slice {
self.inner.as_str()
}

pub fn to_str_split(&self) -> (&str, &Slice) {
let (prefix, suffix) = self.inner.to_str_split();
(prefix, Slice { inner: suffix })
}

pub fn to_string_lossy(&self) -> Cow<'_, str> {
self.inner.to_string_lossy()
}
Expand Down
68 changes: 68 additions & 0 deletions library/std/src/sys_common/wtf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,42 @@ impl Wtf8Buf {
}
}

/// Consumes the WTF-8 string and converts it to a (UTF-8, WTF-8) pair.
///
/// The first element of the return value is the longest prefix of valid
/// UTF-8, with the second element being the remainder.
///
/// No data is copied when the whole string is UTF-8 or when it starts with
/// a surrogate: the existing buffer is returned as the matching element.
/// Otherwise the prefix keeps the original buffer and the remainder is
/// copied into a newly allocated one.
pub fn into_string_split(self) -> (String, Wtf8Buf) {
    // Skip the scan entirely when the buffer is already known to be UTF-8.
    let first_surrogate =
        if self.is_known_utf8 { None } else { self.next_surrogate(0).map(|(pos, _)| pos) };

    match first_surrogate {
        None => {
            // SAFETY: Either the inner value is known to be UTF-8, or it is
            // well-formed WTF-8 that contains no surrogates, which is also
            // well-formed UTF-8.
            let utf8 = unsafe { String::from_utf8_unchecked(self.bytes) };
            (utf8, Wtf8Buf::new())
        }
        // The string starts with a surrogate: there is no UTF-8 prefix and
        // nothing to split off.
        Some(0) => (String::new(), self),
        Some(surrogate_pos) => {
            let mut utf8_bytes = self.bytes;
            let wtf8_bytes = utf8_bytes.split_off(surrogate_pos);
            // SAFETY: `utf8_bytes` is a prefix of a WTF-8 value that contains
            // no surrogates, and well-formed WTF-8 that contains no
            // surrogates is also well-formed UTF-8.
            let utf8 = unsafe { String::from_utf8_unchecked(utf8_bytes) };
            // The remainder begins with a surrogate, so it is not UTF-8.
            (utf8, Wtf8Buf { bytes: wtf8_bytes, is_known_utf8: false })
        }
    }
}

/// Converts this `Wtf8Buf` into a boxed `Wtf8`.
#[inline]
pub fn into_box(self) -> Box<Wtf8> {
Expand Down Expand Up @@ -664,6 +700,38 @@ impl Wtf8 {
}
}

/// Losslessly splits a WTF-8 string into a (UTF-8, WTF-8) pair.
///
/// Both elements borrow from `self`; no data is copied.
///
/// The first element of the return value is the longest prefix of valid
/// UTF-8, with the second element being the remainder.
pub fn to_str_split(&self) -> (&str, &Wtf8) {
    match self.next_surrogate(0) {
        None => {
            // SAFETY: Well-formed WTF-8 that contains no surrogates is
            // also well-formed UTF-8.
            let whole = unsafe { str::from_utf8_unchecked(&self.bytes) };
            (whole, Wtf8::from_str(""))
        }
        // The string starts with a surrogate: there is no UTF-8 prefix.
        Some((0, _)) => ("", self),
        Some((surrogate_pos, _)) => {
            let (head, tail) = self.bytes.split_at(surrogate_pos);
            // SAFETY: `head` is a prefix of a WTF-8 value that contains no
            // surrogates, and well-formed WTF-8 that contains no surrogates
            // is also well-formed UTF-8. `tail` is the rest of that same
            // WTF-8 value, starting at the first surrogate.
            unsafe { (str::from_utf8_unchecked(head), Wtf8::from_bytes_unchecked(tail)) }
        }
    }
}

/// Converts the WTF-8 string to potentially ill-formed UTF-16
/// and return an iterator of 16-bit code units.
///
Expand Down
34 changes: 34 additions & 0 deletions library/std/src/sys_common/wtf8/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,26 @@ fn wtf8buf_into_string_lossy() {
assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩�"));
}

#[test]
fn wtf8buf_into_string_split() {
    // is_known_utf8
    let mut string = Wtf8Buf::from_str("aé");
    assert_eq!(string.clone().into_string_split(), (String::from("aé"), Wtf8Buf::new()),);

    // !is_known_utf8, next_surrogate(0).is_none()
    string.push_char(' ');
    string.push(CodePoint::from_u32(0xD83D).unwrap());
    string.push(CodePoint::from_u32(0xDCA9).unwrap());
    assert_eq!(string.clone().into_string_split(), (String::from("aé 💩"), Wtf8Buf::new()),);

    // !is_known_utf8, next_surrogate(0).is_some()
    string.push(CodePoint::from_u32(0xD800).unwrap());
    assert_eq!(
        string.clone().into_string_split(),
        (String::from("aé 💩"), Wtf8Buf::from_wide(&[0xD800])),
    );

    // Surrogate at position 0: the prefix is empty and the whole value,
    // including the text after the surrogate, is returned as the suffix.
    let mut leading = Wtf8Buf::new();
    leading.push(CodePoint::from_u32(0xD800).unwrap());
    leading.push_char('a');
    assert_eq!(leading.clone().into_string_split(), (String::new(), leading));
}

#[test]
fn wtf8buf_from_iterator() {
fn f(values: &[u32]) -> Wtf8Buf {
Expand Down Expand Up @@ -538,6 +558,20 @@ fn wtf8_to_string_lossy() {
assert_eq!(string.to_string_lossy(), expected);
}

#[test]
fn wtf8_to_str_split() {
    // next_surrogate(0).is_none()
    let mut string = Wtf8Buf::from_str("aé 💩");
    assert_eq!(string.as_slice().to_str_split(), ("aé 💩", Wtf8::from_str("")),);

    // next_surrogate(0).is_some()
    string.push(CodePoint::from_u32(0xD800).unwrap());
    assert_eq!(
        string.as_slice().to_str_split(),
        ("aé 💩", Wtf8Buf::from_wide(&[0xD800]).as_slice()),
    );

    // Surrogate at position 0: the prefix is empty and the whole value,
    // including the text after the surrogate, is returned as the suffix.
    let mut leading = Wtf8Buf::from_wide(&[0xD800]);
    leading.push_char('a');
    assert_eq!(leading.as_slice().to_str_split(), ("", leading.as_slice()));
}

#[test]
fn wtf8_display() {
fn d(b: &[u8]) -> String {
Expand Down

0 comments on commit 0f93dae

Please sign in to comment.