Add OsStr::to_str_split() and OsString::into_string_split().

rust-lang · May 1, 2023 · 0f93dae · 0f93dae
1 parent 9ecda8d
commit 0f93dae
Show file tree

Hide file tree

Showing 6 changed files with 238 additions and 0 deletions.
diff --git a/library/std/src/ffi/os_str.rs b/library/std/src/ffi/os_str.rs
@@ -178,6 +178,34 @@ impl OsString {
  self.inner.into_string().map_err(|buf| OsString { inner: buf })
  }
 
+ /// Divides the `OsString` at the point where it stops being valid Unicode.
+ ///
+ /// The first element of the returned pair is a `String` holding the longest
+ /// prefix of the `OsString` that is valid Unicode. The second element is an
+ /// `OsString` holding the rest of the original value, so pushing it back
+ /// onto the prefix reproduces the original.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// #![feature(osstr_str_prefix_ops)]
+ ///
+ /// use std::ffi::OsString;
+ ///
+ /// let os_string = OsString::from("foo");
+ /// let (prefix, suffix) = os_string.clone().into_string_split();
+ ///
+ /// let mut rejoined = OsString::from(prefix);
+ /// rejoined.push(suffix);
+ /// assert_eq!(rejoined, os_string);
+ /// ```
+ #[unstable(feature = "osstr_str_prefix_ops", issue = "none")]
+ #[must_use]
+ #[inline]
+ pub fn into_string_split(self) -> (String, OsString) {
+     // The platform buffer does the actual split; only the remainder needs
+     // to be wrapped back into the public type.
+     let (unicode, rest) = self.inner.into_string_split();
+     (unicode, OsString { inner: rest })
+ }
+
  /// Extends the string with the given <code>&[OsStr]</code> slice.
  ///
  /// # Examples
@@ -703,6 +731,34 @@ impl OsStr {
  self.inner.to_str()
  }
 
+ /// Divides the `OsStr` at the point where it stops being valid Unicode.
+ ///
+ /// The first element of the returned pair is a `str` holding the longest
+ /// prefix of the `OsStr` that is valid Unicode. The second element is an
+ /// `OsStr` holding the rest of the original value, so pushing it back onto
+ /// the prefix reproduces the original.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// #![feature(osstr_str_prefix_ops)]
+ ///
+ /// use std::ffi::{OsStr, OsString};
+ ///
+ /// let os_str = OsStr::new("foo");
+ /// let (prefix, suffix) = os_str.to_str_split();
+ ///
+ /// let mut rejoined = OsString::from(prefix);
+ /// rejoined.push(suffix);
+ /// assert_eq!(rejoined, os_str);
+ /// ```
+ #[unstable(feature = "osstr_str_prefix_ops", issue = "none")]
+ #[must_use]
+ #[inline]
+ pub fn to_str_split(&self) -> (&str, &OsStr) {
+     // The platform slice does the actual split; only the remainder needs
+     // to be wrapped back into the public type.
+     let (unicode, rest) = self.inner.to_str_split();
+     (unicode, Self::from_inner(rest))
+ }
+
  /// Converts an `OsStr` to a <code>[Cow]<[str]></code>.
  ///
  /// Any non-Unicode sequences are replaced with

diff --git a/library/std/src/sys/unix/os_str.rs b/library/std/src/sys/unix/os_str.rs
@@ -164,6 +164,27 @@ impl Buf {
  String::from_utf8(self.inner).map_err(|p| Buf { inner: p.into_bytes() })
  }
 
+ /// Consumes the buffer and splits it into the longest valid UTF-8 prefix
+ /// (as a `String`) and a `Buf` holding the remaining bytes.
+ pub fn into_string_split(self) -> (String, Buf) {
+     let valid_len = match str::from_utf8(&self.inner) {
+         Err(err) => err.valid_up_to(),
+         Ok(_) => {
+             // SAFETY: `str::from_utf8()` just succeeded on these exact
+             // bytes, so the whole buffer is UTF-8.
+             let whole = unsafe { String::from_utf8_unchecked(self.inner) };
+             return (whole, Buf { inner: Vec::new() });
+         }
+     };
+
+     // No valid prefix at all: hand the buffer back untouched instead of
+     // splitting it.
+     if valid_len == 0 {
+         return (String::new(), self);
+     }
+
+     let mut head = self.inner;
+     let tail = head.split_off(valid_len);
+     // SAFETY: `Utf8Error::valid_up_to()` returns an index up to which
+     // valid UTF-8 has been verified, and `head` was truncated to exactly
+     // that length.
+     let prefix = unsafe { String::from_utf8_unchecked(head) };
+     (prefix, Buf { inner: tail })
+ }
+
  pub fn push_slice(&mut self, s: &Slice) {
  self.inner.extend_from_slice(&s.inner)
  }
@@ -205,6 +226,21 @@ impl Slice {
  str::from_utf8(&self.inner).ok()
  }
 
+ /// Splits the slice into the longest valid UTF-8 prefix (as a `str`) and
+ /// a `Slice` covering the remaining bytes.
+ pub fn to_str_split(&self) -> (&str, &Slice) {
+     match str::from_utf8(&self.inner) {
+         // Everything is valid UTF-8, so the remainder is empty.
+         Ok(whole) => (whole, Slice::from_u8_slice(b"")),
+         Err(err) => match err.valid_up_to() {
+             // Nothing valid at the front: the whole slice is the remainder.
+             0 => ("", self),
+             valid_len => {
+                 let (head, tail) = self.inner.split_at(valid_len);
+                 // SAFETY: `Utf8Error::valid_up_to()` returns an index up to
+                 // which valid UTF-8 has been verified.
+                 let prefix = unsafe { str::from_utf8_unchecked(head) };
+                 (prefix, Slice::from_u8_slice(tail))
+             }
+         },
+     }
+ }
+
  pub fn to_string_lossy(&self) -> Cow<'_, str> {
  String::from_utf8_lossy(&self.inner)
  }

diff --git a/library/std/src/sys/unix/os_str/tests.rs b/library/std/src/sys/unix/os_str/tests.rs
@@ -16,3 +16,37 @@ fn display() {
  Slice::from_u8_slice(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string(),
  );
 }
+
+#[test]
+fn buf_into_string_split() {
+    let mut buf = Buf::from_string(String::from("héllô wørld"));
+
+    // Entirely valid UTF-8: the prefix is the whole value and nothing remains.
+    let (head, tail) = buf.clone().into_string_split();
+    assert_eq!(head, String::from("héllô wørld"));
+    assert_eq!(tail.into_inner(), Vec::new());
+
+    // With a trailing 0xFF byte, the split lands just before that byte.
+    buf.push_slice(Slice::from_u8_slice(b"\xFF"));
+    let (head, tail) = buf.clone().into_string_split();
+    assert_eq!(head, String::from("héllô wørld"));
+    assert_eq!(tail.into_inner(), vec![0xFF]);
+}
+
+#[test]
+fn slice_to_str_split() {
+    let mut buf = Buf::from_string(String::from("héllô wørld"));
+
+    // Entirely valid UTF-8: the prefix is the whole value and nothing remains.
+    let (head, tail) = buf.as_slice().to_str_split();
+    assert_eq!(head, "héllô wørld");
+    assert_eq!(&tail.inner, b"");
+
+    // With a trailing 0xFF byte, the split lands just before that byte.
+    buf.push_slice(Slice::from_u8_slice(b"\xFF"));
+    let (head, tail) = buf.as_slice().to_str_split();
+    assert_eq!(head, String::from("héllô wørld"));
+    assert_eq!(&tail.inner, b"\xFF");
+}
diff --git a/library/std/src/sys/windows/os_str.rs b/library/std/src/sys/windows/os_str.rs
@@ -98,6 +98,11 @@ impl Buf {
  self.inner.into_string().map_err(|buf| Buf { inner: buf })
  }
 
+ /// Consumes the buffer and splits it into a `String` prefix and a `Buf`
+ /// remainder, delegating the split itself to the inner WTF-8 buffer.
+ pub fn into_string_split(self) -> (String, Buf) {
+     let (utf8, rest) = self.inner.into_string_split();
+     (utf8, Buf { inner: rest })
+ }
+
  pub fn push_slice(&mut self, s: &Slice) {
  self.inner.push_wtf8(&s.inner)
  }
@@ -159,6 +164,11 @@ impl Slice {
  self.inner.as_str()
  }
 
+ /// Splits the slice into a `str` prefix and a `Slice` remainder,
+ /// delegating the split itself to the inner WTF-8 slice.
+ pub fn to_str_split(&self) -> (&str, &Slice) {
+     let (prefix, suffix) = self.inner.to_str_split();
+     // The inner call yields a borrowed `&Wtf8`, so the remainder has to be
+     // reinterpreted as `&Slice`; building `Slice { .. }` by value does not
+     // match the `&Slice` return type.
+     //
+     // SAFETY: assumes `Slice` is a transparent wrapper around `Wtf8`, so
+     // `&Wtf8` and `&Slice` share a layout.
+     // NOTE(review): confirm against the `Slice` definition in this file.
+     let suffix: &Slice = unsafe { crate::mem::transmute(suffix) };
+     (prefix, suffix)
+ }
+
  pub fn to_string_lossy(&self) -> Cow<'_, str> {
  self.inner.to_string_lossy()
  }

diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs
@@ -441,6 +441,42 @@ impl Wtf8Buf {
  }
  }
 
+ /// Consumes the WTF-8 string and converts it to a (UTF-8, WTF-8) pair.
+ ///
+ /// The first element of the return value is the longest prefix of valid
+ /// UTF-8, with the second element being the remainder.
+ ///
+ /// When either element is empty the existing buffer is handed over as-is.
+ /// Otherwise the prefix keeps the buffer and the remainder is moved into a
+ /// newly allocated vector by `Vec::split_off`, so the remainder's bytes are
+ /// copied.
+ pub fn into_string_split(self) -> (String, Wtf8Buf) {
+     // A buffer already flagged as UTF-8 is never scanned for surrogates.
+     let first_surrogate =
+         if self.is_known_utf8 { None } else { self.next_surrogate(0).map(|(pos, _)| pos) };
+
+     let Some(surrogate_pos) = first_surrogate else {
+         // SAFETY: Either the inner value is known to be UTF-8, or it is
+         // well-formed WTF-8 that contains no surrogates, which is also
+         // well-formed UTF-8.
+         let utf8 = unsafe { String::from_utf8_unchecked(self.bytes) };
+         return (utf8, Wtf8Buf::new());
+     };
+
+     // The value starts with a surrogate: there is no UTF-8 prefix, so
+     // return the buffer untouched rather than splitting it.
+     if surrogate_pos == 0 {
+         return (String::new(), self);
+     }
+
+     let mut utf8_bytes = self.bytes;
+     let wtf8_bytes = utf8_bytes.split_off(surrogate_pos);
+     // SAFETY: `utf8_bytes` is a prefix of a WTF-8 value that contains no
+     // surrogates, and well-formed WTF-8 that contains no surrogates is
+     // also well-formed UTF-8.
+     let utf8 = unsafe { String::from_utf8_unchecked(utf8_bytes) };
+     // The remainder begins with a surrogate, so it is not UTF-8.
+     (utf8, Wtf8Buf { bytes: wtf8_bytes, is_known_utf8: false })
+ }
+
  /// Converts this `Wtf8Buf` into a boxed `Wtf8`.
  #[inline]
  pub fn into_box(self) -> Box<Wtf8> {
@@ -664,6 +700,38 @@ impl Wtf8 {
  }
  }
 
+ /// Losslessly splits a WTF-8 string into a (UTF-8, WTF-8) pair.
+ ///
+ /// This does not copy the data.
+ ///
+ /// The first element of the return value is the longest prefix of valid
+ /// UTF-8, with the second element being the remainder.
+ pub fn to_str_split(&self) -> (&str, &Wtf8) {
+     let Some((surrogate_pos, _)) = self.next_surrogate(0) else {
+         // SAFETY: Well-formed WTF-8 that contains no surrogates is
+         // also well-formed UTF-8.
+         let whole = unsafe { str::from_utf8_unchecked(&self.bytes) };
+         return (whole, Wtf8::from_str(""));
+     };
+
+     // The value starts with a surrogate: there is no UTF-8 prefix.
+     if surrogate_pos == 0 {
+         return ("", self);
+     }
+
+     let (head, tail) = self.bytes.split_at(surrogate_pos);
+     // SAFETY: `head` is a prefix of a WTF-8 value that contains no
+     // surrogates, and well-formed WTF-8 that contains no surrogates is
+     // also well-formed UTF-8.
+     let utf8 = unsafe { str::from_utf8_unchecked(head) };
+     // SAFETY: `tail` is the rest of the same WTF-8 value, starting at the
+     // position `next_surrogate` reported.
+     let wtf8 = unsafe { Wtf8::from_bytes_unchecked(tail) };
+     (utf8, wtf8)
+ }
+
  /// Converts the WTF-8 string to potentially ill-formed UTF-16
  /// and return an iterator of 16-bit code units.
  ///

diff --git a/library/std/src/sys_common/wtf8/tests.rs b/library/std/src/sys_common/wtf8/tests.rs
@@ -352,6 +352,26 @@ fn wtf8buf_into_string_lossy() {
  assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩�"));
 }
 
+#[test]
+fn wtf8buf_into_string_split() {
+    // is_known_utf8
+    let mut buf = Wtf8Buf::from_str("aé");
+    let expected = (String::from("aé"), Wtf8Buf::new());
+    assert_eq!(buf.clone().into_string_split(), expected);
+
+    // !is_known_utf8, next_surrogate(0).is_none()
+    buf.push_char(' ');
+    buf.push(CodePoint::from_u32(0xD83D).unwrap());
+    buf.push(CodePoint::from_u32(0xDCA9).unwrap());
+    let expected = (String::from("aé 💩"), Wtf8Buf::new());
+    assert_eq!(buf.clone().into_string_split(), expected);
+
+    // !is_known_utf8, next_surrogate(0).is_some()
+    buf.push(CodePoint::from_u32(0xD800).unwrap());
+    let expected = (String::from("aé 💩"), Wtf8Buf::from_wide(&[0xD800]));
+    assert_eq!(buf.clone().into_string_split(), expected);
+}
+
 #[test]
 fn wtf8buf_from_iterator() {
  fn f(values: &[u32]) -> Wtf8Buf {
@@ -538,6 +558,20 @@ fn wtf8_to_string_lossy() {
  assert_eq!(string.to_string_lossy(), expected);
 }
 
+#[test]
+fn wtf8_to_str_split() {
+    // next_surrogate(0).is_none()
+    let mut buf = Wtf8Buf::from_str("aé 💩");
+    assert_eq!(buf.as_slice().to_str_split(), ("aé 💩", Wtf8::from_str("")));
+
+    // next_surrogate(0).is_some()
+    buf.push(CodePoint::from_u32(0xD800).unwrap());
+    let lone_surrogate = Wtf8Buf::from_wide(&[0xD800]);
+    assert_eq!(buf.as_slice().to_str_split(), ("aé 💩", lone_surrogate.as_slice()));
+}
+
 #[test]
 fn wtf8_display() {
  fn d(b: &[u8]) -> String {