From 1c79e46e01496d9b4b0c95084078bdcd709094f2 Mon Sep 17 00:00:00 2001 From: Robert Forsman Date: Thu, 7 Apr 2022 17:00:59 -0400 Subject: [PATCH] new method next_code_point_val() which is just like next_code_point, but it accepts an iterator over u8 instead of &u8 --- library/alloc/tests/lib.rs | 1 + library/alloc/tests/str.rs | 69 +++++++++++++++++++++++++++++ library/core/src/str/mod.rs | 2 +- library/core/src/str/validations.rs | 21 +++++++-- 4 files changed, 88 insertions(+), 5 deletions(-) diff --git a/library/alloc/tests/lib.rs b/library/alloc/tests/lib.rs index abce47e5afe1a..6df3a9ef6e046 100644 --- a/library/alloc/tests/lib.rs +++ b/library/alloc/tests/lib.rs @@ -38,6 +38,7 @@ #![feature(const_str_from_utf8)] #![feature(nonnull_slice_from_raw_parts)] #![feature(panic_update_hook)] +#![feature(str_internals)] use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index 273b39aa45a48..f9196f87cb39e 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -1,3 +1,4 @@ +use core::str::next_code_point_val; use std::assert_matches::assert_matches; use std::borrow::Cow; use std::cmp::Ordering::{Equal, Greater, Less}; @@ -2367,3 +2368,71 @@ fn ceil_char_boundary() { fn ceil_char_boundary_above_len_panic() { let _ = "x".ceil_char_boundary(2); } + +fn check_decoded_string<I: Iterator<Item = u8>>(mut iter: &mut I, expected: &str) { + for char in expected.chars() { + assert_eq!(Some(char as u32), unsafe { next_code_point_val(&mut iter) }); + } + assert_eq!(None, unsafe { next_code_point_val(&mut iter) }); +} + +#[test] +pub fn dirt_simple_code_point() { + unsafe { + let src = b"banana"; + let mut iter = src.iter().copied(); + for char in "banana".chars() { + assert_eq!(Some(char as u32), next_code_point_val(&mut iter)); + } + + let tmp = next_code_point_val(&mut iter); + assert_eq!(None, tmp); + } + { + let src = [ + b'd', b'a', b'i', b' ', 229, 164, 167, 232, 179, 162, 232, 128, 
133, b' ', b'k', b'e', + b'n', b'j', b'a', + ]; + + let mut iter = src.into_iter(); + + check_decoded_string(&mut iter, "dai 大賢者 kenja"); + } +} + +struct Shenanigans { + compressed: Vec<u8>, + cursor: usize, +} + +// This provides an example of a u8 iterator which can not use Item=&u8. +// A real-world case is a string stored in progmem on an AVR, which can +// not be incorporated into a unit test +impl Iterator for Shenanigans { + type Item = u8; + fn next(&mut self) -> Option<<Self as Iterator>::Item> { + let end = self.cursor + 6; + let i1 = self.cursor / 8; + let i2 = (end - 1) / 8; + if i2 >= self.compressed.len() { + return None; + } + let base64 = if i1 == i2 { + self.compressed[i1] >> (2 - self.cursor % 8) & 0x3f + } else { + 0x3f & ((self.compressed[i1] << (self.cursor % 8 - 2)) + | (self.compressed[i2] >> (10 - self.cursor % 8))) + }; + self.cursor += 6; + Some(base64 + b' ') + } +} + +#[test] +pub fn fancy_code_point() { + let mut iter = + Shenanigans { compressed: vec![142, 139, 236, 228, 10, 238, 166, 122, 52], cursor: 0 }; + for char in "CHOLY KNIGHT".chars() { + assert_eq!(Some(char as u32), unsafe { next_code_point_val(&mut iter) }); + } +} diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 6bfa6a5e01519..4debb23b46a29 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -70,7 +70,7 @@ pub use iter::SplitAsciiWhitespace; pub use iter::SplitInclusive; #[unstable(feature = "str_internals", issue = "none")] -pub use validations::{next_code_point, utf8_char_width}; +pub use validations::{next_code_point, next_code_point_val, utf8_char_width}; use iter::MatchIndicesInternal; use iter::SplitInternal; diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs index 0d3dc856be577..741b2d11f6c79 100644 --- a/library/core/src/str/validations.rs +++ b/library/core/src/str/validations.rs @@ -34,8 +34,21 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool { #[unstable(feature = "str_internals", 
issue = "none")] #[inline] pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> { + // SAFETY: `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string + unsafe { next_code_point_val(&mut bytes.copied()) } +} + +/// Reads the next code point out of a byte iterator (assuming a +/// UTF-8-like encoding). +/// +/// # Safety +/// +/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string +#[unstable(feature = "str_internals", issue = "none")] +#[inline] +pub unsafe fn next_code_point_val<I: Iterator<Item = u8>>(bytes: &mut I) -> Option<u32> { // Decode UTF-8 - let x = *bytes.next()?; + let x = bytes.next()?; if x < 128 { return Some(x as u32); } @@ -46,14 +59,14 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> let init = utf8_first_byte(x, 2); // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. - let y = unsafe { *bytes.next().unwrap_unchecked() }; + let y = unsafe { bytes.next().unwrap_unchecked() }; let mut ch = utf8_acc_cont_byte(init, y); if x >= 0xE0 { // [[x y z] w] case // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. - let z = unsafe { *bytes.next().unwrap_unchecked() }; + let z = unsafe { bytes.next().unwrap_unchecked() }; let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); ch = init << 12 | y_z; if x >= 0xF0 { @@ -61,7 +74,7 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> // use only the lower 3 bits of `init` // SAFETY: `bytes` produces an UTF-8-like string, // so the iterator must produce a value here. - let w = unsafe { *bytes.next().unwrap_unchecked() }; + let w = unsafe { bytes.next().unwrap_unchecked() }; ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); } }