diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs index 8497740990443..b6134256a7c78 100644 --- a/library/alloc/src/str.rs +++ b/library/alloc/src/str.rs @@ -10,8 +10,8 @@ use core::borrow::{Borrow, BorrowMut}; use core::iter::FusedIterator; use core::mem; +use core::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher}; use core::ptr; -use core::str::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher}; use core::unicode::conversions; use crate::borrow::ToOwned; @@ -268,7 +268,7 @@ impl str { without modifying the original"] #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn replace<'a, P: Pattern<'a>>(&'a self, from: P, to: &str) -> String { + pub fn replace<'a, P: Pattern<&'a str>>(&'a self, from: P, to: &str) -> String { let mut result = String::new(); let mut last_end = 0; for (start, part) in self.match_indices(from) { @@ -308,7 +308,7 @@ impl str { #[must_use = "this returns the replaced string as a new allocation, \ without modifying the original"] #[stable(feature = "str_replacen", since = "1.16.0")] - pub fn replacen<'a, P: Pattern<'a>>(&'a self, pat: P, to: &str, count: usize) -> String { + pub fn replacen<'a, P: Pattern<&'a str>>(&'a self, pat: P, to: &str, count: usize) -> String { // Hope to reduce the times of re-allocation let mut result = String::with_capacity(32); let mut last_end = 0; diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index b9ef76c109abf..da58797ad11d4 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -55,9 +55,9 @@ use core::ops::AddAssign; #[cfg(not(no_global_oom_handling))] use core::ops::Bound::{Excluded, Included, Unbounded}; use core::ops::{self, Index, IndexMut, Range, RangeBounds}; +use core::pattern::Pattern; use core::ptr; use core::slice; -use core::str::pattern::Pattern; #[cfg(not(no_global_oom_handling))] use core::str::Utf8Chunks; @@ -1371,9 +1371,9 @@ impl String { #[unstable(feature = "string_remove_matches", 
reason = "new API", issue = "72826")] pub fn remove_matches<'a, P>(&'a mut self, pat: P) where - P: for<'x> Pattern<'x>, + P: for<'x> Pattern<&'x str>, { - use core::str::pattern::Searcher; + use core::pattern::Searcher; let rejections = { let mut searcher = pat.into_searcher(self); @@ -2174,10 +2174,10 @@ impl<'a> Extend> for String { reason = "API not fully fleshed out and ready to be stabilized", issue = "27721" )] -impl<'a, 'b> Pattern<'a> for &'b String { - type Searcher = <&'b str as Pattern<'a>>::Searcher; +impl<'a, 'b> Pattern<&'a str> for &'b String { + type Searcher = <&'b str as Pattern<&'a str>>::Searcher; - fn into_searcher(self, haystack: &'a str) -> <&'b str as Pattern<'a>>::Searcher { + fn into_searcher(self, haystack: &'a str) -> <&'b str as Pattern<&'a str>>::Searcher { self[..].into_searcher(haystack) } diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index c1dbbde08b6b9..8334c79e5cad5 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -1868,14 +1868,14 @@ fn test_repeat() { } mod pattern { - use std::str::pattern::SearchStep::{self, Done, Match, Reject}; - use std::str::pattern::{Pattern, ReverseSearcher, Searcher}; + use core::pattern::SearchStep::{self, Done, Match, Reject}; + use core::pattern::{Pattern, ReverseSearcher, Searcher}; macro_rules! 
make_test { ($name:ident, $p:expr, $h:expr, [$($e:expr,)*]) => { #[allow(unused_imports)] mod $name { - use std::str::pattern::SearchStep::{Match, Reject}; + use core::pattern::SearchStep::{Match, Reject}; use super::{cmp_search_to_vec}; #[test] fn fwd() { @@ -1891,7 +1891,7 @@ mod pattern { fn cmp_search_to_vec<'a>( rev: bool, - pat: impl Pattern<'a, Searcher: ReverseSearcher<'a>>, + pat: impl Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, haystack: &'a str, right: Vec, ) { @@ -1972,7 +1972,7 @@ mod pattern { str_searcher_multibyte_haystack, " ", "├──", - [Reject(0, 3), Reject(3, 6), Reject(6, 9),] + [Reject(0, 9),] ); make_test!( str_searcher_empty_needle_multibyte_haystack, @@ -2008,13 +2008,13 @@ mod pattern { char_searcher_multibyte_haystack, ' ', "├──", - [Reject(0, 3), Reject(3, 6), Reject(6, 9),] + [Reject(0, 9),] ); make_test!( char_searcher_short_haystack, '\u{1F4A9}', "* \t", - [Reject(0, 1), Reject(1, 2), Reject(2, 3),] + [Reject(0, 3),] ); // See #85462 @@ -2151,11 +2151,11 @@ generate_iterator_test! { #[test] fn different_str_pattern_forwarding_lifetimes() { - use std::str::pattern::Pattern; + use core::pattern::Pattern; fn foo<'a, P>(p: P) where - for<'b> &'b P: Pattern<'a>, + for<'b> &'b P: Pattern<&'a str>, { for _ in 0..3 { "asdf".find(&p); diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs index ed0c05a686319..eae36759e6807 100644 --- a/library/core/src/lib.rs +++ b/library/core/src/lib.rs @@ -362,8 +362,11 @@ pub mod sync; pub mod fmt; pub mod hash; +pub mod pattern; pub mod slice; pub mod str; +#[allow(missing_docs)] +pub mod str_bytes; pub mod time; pub mod unicode; diff --git a/library/core/src/pattern.rs b/library/core/src/pattern.rs new file mode 100644 index 0000000000000..0a50ebb1b725a --- /dev/null +++ b/library/core/src/pattern.rs @@ -0,0 +1,875 @@ +//! The Pattern API. +//! +//! The Pattern API provides a generic mechanism for using different pattern +//! types when searching through different objects. +//! +//! 
For more details, see the traits [`Pattern`], [`Haystack`], [`Searcher`], +//! [`ReverseSearcher`] and [`DoubleEndedSearcher`]. Although this API is +//! unstable, it is exposed via stable methods on corresponding haystack types. +//! +//! # Examples +//! +//! [`Pattern<&str>`] is [implemented][pattern-impls] in the stable API for +//! [`&str`][`str`], [`char`], slices of [`char`], and functions and closures +//! implementing `FnMut(char) -> bool`. +//! +//! ``` +//! let s = "Can you find a needle in a haystack?"; +//! +//! // &str pattern +//! assert_eq!(s.find("you"), Some(4)); +//! // char pattern +//! assert_eq!(s.find('n'), Some(2)); +//! // array of chars pattern +//! assert_eq!(s.find(&['a', 'e', 'i', 'o', 'u']), Some(1)); +//! // slice of chars pattern +//! assert_eq!(s.find(&['a', 'e', 'i', 'o', 'u'][..]), Some(1)); +//! // closure pattern +//! assert_eq!(s.find(|c: char| c.is_ascii_punctuation()), Some(35)); +//! ``` +//! +//! [pattern-impls]: Pattern#implementors + +#![unstable( + feature = "pattern", + reason = "API not fully fleshed out and ready to be stabilized", + issue = "27721" +)] + +use crate::fmt; +use crate::mem::{replace, take}; +use crate::ops::Range; + +/// A pattern which can be matched against a [`Haystack`]. +/// +/// A `Pattern` expresses that the implementing type can be used as a pattern +/// for searching in an `H`. For example, character `'a'` and string `"aa"` are +/// patterns that would match at index `1` in the string `"baaaab"`. +/// +/// The trait itself acts as a builder for an associated [`Searcher`] type, +/// which does the actual work of finding occurrences of the pattern in +/// a string. +/// +/// Depending on the type of the haystack and the pattern, the semantics of the +/// pattern can change. The table below describes some of those behaviours for +/// a [`&str`][str] haystack. 
+/// +/// | Pattern type | Match condition | +/// |--------------------------|-------------------------------------------| +/// | `&str` | is substring | +/// | `char` | is contained in string | +/// | `&[char]` | any char in slice is contained in string | +/// | `F: FnMut(char) -> bool` | `F` returns `true` for a char in string | +/// +/// # Examples +/// +/// ``` +/// // &str pattern matching &str +/// assert_eq!("abaaa".find("ba"), Some(1)); +/// assert_eq!("abaaa".find("bac"), None); +/// +/// // char pattern matching &str +/// assert_eq!("abaaa".find('a'), Some(0)); +/// assert_eq!("abaaa".find('b'), Some(1)); +/// assert_eq!("abaaa".find('c'), None); +/// +/// // &[char; N] pattern matching &str +/// assert_eq!("ab".find(&['b', 'a']), Some(0)); +/// assert_eq!("abaaa".find(&['a', 'z']), Some(0)); +/// assert_eq!("abaaa".find(&['c', 'd']), None); +/// +/// // &[char] pattern matching &str +/// assert_eq!("ab".find(&['b', 'a'][..]), Some(0)); +/// assert_eq!("abaaa".find(&['a', 'z'][..]), Some(0)); +/// assert_eq!("abaaa".find(&['c', 'd'][..]), None); +/// +/// // FnMut(char) -> bool pattern matching &str +/// assert_eq!("abcdef_z".find(|ch| ch > 'd' && ch < 'y'), Some(4)); +/// assert_eq!("abcddd_z".find(|ch| ch > 'd' && ch < 'y'), None); +/// ``` +pub trait Pattern: Sized { + /// Associated searcher for this pattern. + type Searcher: Searcher; + + /// Constructs the associated searcher from `self` and the `haystack` to + /// search in. + fn into_searcher(self, haystack: H) -> Self::Searcher; + + /// Checks whether the pattern matches anywhere in the haystack. + fn is_contained_in(self, haystack: H) -> bool { + self.into_searcher(haystack).next_match().is_some() + } + + /// Checks whether the pattern matches at the front of the haystack. + fn is_prefix_of(self, haystack: H) -> bool { + matches!(self.into_searcher(haystack).next(), SearchStep::Match(..)) + } + + /// Checks whether the pattern matches at the back of the haystack. 
+ fn is_suffix_of(self, haystack: H) -> bool + where + Self::Searcher: ReverseSearcher, + { + matches!(self.into_searcher(haystack).next_back(), SearchStep::Match(..)) + } + + /// Removes the pattern from the front of haystack, if it matches. + fn strip_prefix_of(self, haystack: H) -> Option { + if let SearchStep::Match(start, pos) = self.into_searcher(haystack).next() { + // This cannot be debug_assert_eq because H::Cursor isn’t Debug. + debug_assert!( + start == haystack.cursor_at_front(), + "The first search step from Searcher \ + must include the first character" + ); + let end = haystack.cursor_at_back(); + // SAFETY: `Searcher` is known to return valid indices. + Some(unsafe { haystack.get_unchecked(pos..end) }) + } else { + None + } + } + + /// Removes the pattern from the back of haystack, if it matches. + fn strip_suffix_of(self, haystack: H) -> Option + where + Self::Searcher: ReverseSearcher, + { + if let SearchStep::Match(pos, end) = self.into_searcher(haystack).next_back() { + // This cannot be debug_assert_eq because H::Cursor isn’t Debug. + debug_assert!( + end == haystack.cursor_at_back(), + "The first search step from ReverseSearcher \ + must include the last character" + ); + let start = haystack.cursor_at_front(); + // SAFETY: `Searcher` is known to return valid indices. + Some(unsafe { haystack.get_unchecked(start..pos) }) + } else { + None + } + } +} + +/// A type which can be searched in using a [`Pattern`]. +/// +/// The trait is used in combination with [`Pattern`] trait to express a pattern +/// that can be used to search for elements in given haystack. +pub trait Haystack: Sized + Copy { + /// A cursor representing position in the haystack or its end. + type Cursor: Copy + PartialEq; + + /// Returns cursor pointing at the beginning of the haystack. + fn cursor_at_front(self) -> Self::Cursor; + + /// Returns cursor pointing at the end of the haystack. 
+ fn cursor_at_back(self) -> Self::Cursor; + + /// Returns whether the haystack is empty. + fn is_empty(self) -> bool; + + /// Returns portions of the haystack indicated by the cursor range. + /// + /// # Safety + /// + /// Range’s start and end must be valid haystack split positions. + /// Furthermore, start mustn’t point at position after end. + /// + /// Valid split positions are: + /// - the front of the haystack (as returned by + /// [`cursor_at_front()`][Self::cursor_at_front]), + /// - the back of the haystack (as returned by + /// [`cursor_at_back()`][Self::cursor_at_back]) or + /// - any cursor returned by a [`Searcher`] or [`ReverseSearcher`]. + unsafe fn get_unchecked(self, range: Range) -> Self; +} + +/// Result of calling [`Searcher::next()`] or [`ReverseSearcher::next_back()`]. +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub enum SearchStep { + /// Expresses that a match of the pattern has been found at + /// `haystack[a..b]`. + Match(T, T), + /// Expresses that `haystack[a..b]` has been rejected as a possible match of + /// the pattern. + /// + /// Note that there might be more than one `Reject` between two `Match`es, + /// there is no requirement for them to be combined into one. + Reject(T, T), + /// Expresses that every element of the haystack has been visited, ending + /// the iteration. + Done, +} + +/// Possible return type of a search. +/// +/// It abstracts the differences between `next`, `next_match` and `next_reject` +/// methods. Depending on return type an implementation for those functions +/// will generate matches and rejects, only matches or only rejects. +#[unstable(feature = "pattern_internals", issue = "none")] +pub trait SearchResult: Sized + sealed::Sealed { + /// Value indicating searching has finished. + const DONE: Self; + + /// Whether search should return reject as soon as possible. 
+ /// + /// For example, if a search can quickly determine that the very next + /// position cannot be where a next match starts, it should return a reject + /// with that position. This is an optimisation which allows the algorithm + /// to not waste time looking for the next match if caller is only + /// interested in the next position of a reject. + /// + /// If this is `true`, [`rejecting()`][Self::rejecting] is guaranteed to + /// return `Some` and if this is `false`, [`matching()`][Self::matching] is + /// guaranteed to return `Some`. + const USE_EARLY_REJECT: bool; + + /// Returns value describing a match or `None` if this implementation + /// doesn’t care about matches. + fn matching(start: T, end: T) -> Option; + + /// Returns value describing a reject or `None` if this implementation + /// doesn’t care about rejects. + fn rejecting(start: T, end: T) -> Option; +} + +/// A wrapper for result type which only carries information about matches. +#[unstable(feature = "pattern_internals", issue = "none")] +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub struct MatchOnly(pub Option<(T, T)>); + +/// A wrapper for result type which only carries information about rejects. 
+#[unstable(feature = "pattern_internals", issue = "none")] +#[derive(Copy, Clone, Eq, PartialEq, Debug)] +pub struct RejectOnly(pub Option<(T, T)>); + +impl SearchResult for SearchStep { + const DONE: Self = SearchStep::Done; + const USE_EARLY_REJECT: bool = false; + + #[inline(always)] + fn matching(s: T, e: T) -> Option { + Some(SearchStep::Match(s, e)) + } + + #[inline(always)] + fn rejecting(s: T, e: T) -> Option { + Some(SearchStep::Reject(s, e)) + } +} + +impl SearchResult for MatchOnly { + const DONE: Self = Self(None); + const USE_EARLY_REJECT: bool = false; + + #[inline(always)] + fn matching(s: T, e: T) -> Option { + Some(Self(Some((s, e)))) + } + + #[inline(always)] + fn rejecting(_s: T, _e: T) -> Option { + None + } +} + +impl SearchResult for RejectOnly { + const DONE: Self = Self(None); + const USE_EARLY_REJECT: bool = true; + + #[inline(always)] + fn matching(_s: T, _e: T) -> Option { + None + } + + #[inline(always)] + fn rejecting(s: T, e: T) -> Option { + Some(Self(Some((s, e)))) + } +} + +mod sealed { + pub trait Sealed {} + impl Sealed for super::SearchStep {} + impl Sealed for super::MatchOnly {} + impl Sealed for super::RejectOnly {} +} + +/// A searcher for a string pattern. +/// +/// This trait provides methods for searching for non-overlapping matches of +/// a pattern starting from the front of a haystack `H`. +/// +/// It will be implemented by associated `Searcher` types of the [`Pattern`] +/// trait. +/// +/// The trait is marked unsafe because the indices returned by the +/// [`next()`][Searcher::next] methods are required to lie on valid haystack +/// split positions. This enables consumers of this trait to slice the haystack +/// without additional runtime checks. +pub unsafe trait Searcher { + /// Getter for the underlying string to be searched in + /// + /// Will always return the same haystack that was used when creating the + /// searcher. + fn haystack(&self) -> H; + + /// Performs the next search step starting from the front. 
+ /// + /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` matches + /// the pattern. + /// - Returns [`Reject(a, b)`][SearchStep::Reject] if `haystack[a..b]` can + /// not match the pattern, even partially. + /// - Returns [`Done`][SearchStep::Done] if every byte of the haystack has + /// been visited. + /// + /// The stream of [`Match`][SearchStep::Match] and + /// [`Reject`][SearchStep::Reject] values up to a [`Done`][SearchStep::Done] + /// will contain index ranges that are adjacent, non-overlapping, + /// covering the whole haystack, and laying on utf8 boundaries. + /// + /// A [`Match`][SearchStep::Match] result needs to contain the whole matched + /// pattern, however [`Reject`][SearchStep::Reject] results may be split up + /// into arbitrary many adjacent fragments. Both ranges may have zero length. + /// + /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` might + /// produce the stream `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, + /// 8)]` + fn next(&mut self) -> SearchStep; + + /// Finds the next [`Match`][SearchStep::Match] result. See + /// [`next()`][Searcher::next]. + /// + /// Unlike [`next()`][Searcher::next], there is no guarantee that the + /// returned ranges of this and [`next_reject`][Searcher::next_reject] will + /// overlap. This will return `(start_match, end_match)`, where start_match + /// is the index of where the match begins, and end_match is the index after + /// the end of the match. + fn next_match(&mut self) -> Option<(H::Cursor, H::Cursor)> { + loop { + match self.next() { + SearchStep::Match(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } + + /// Finds the next [`Reject`][SearchStep::Reject] result. See + /// [`next()`][Searcher::next] and [`next_match()`][Searcher::next_match]. 
+ /// + /// Unlike [`next()`][Searcher::next], there is no guarantee that the + /// returned ranges of this and [`next_match`][Searcher::next_match] will + /// overlap. + fn next_reject(&mut self) -> Option<(H::Cursor, H::Cursor)> { + loop { + match self.next() { + SearchStep::Reject(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } +} + +/// A reverse searcher for a string pattern. +/// +/// This trait provides methods for searching for non-overlapping matches of +/// a pattern starting from the back of a haystack `H`. +/// +/// It will be implemented by associated [`Searcher`] types of the [`Pattern`] +/// trait if the pattern supports searching for it from the back. +/// +/// The index ranges returned by this trait are not required to exactly match +/// those of the forward search in reverse. +/// +/// For the reason why this trait is marked unsafe, see the parent trait +/// [`Searcher`]. +pub unsafe trait ReverseSearcher: Searcher { + /// Performs the next search step starting from the back. + /// + /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` + /// matches the pattern. + /// - Returns [`Reject(a, b)`][SearchStep::Reject] if `haystack[a..b]` + /// can not match the pattern, even partially. + /// - Returns [`Done`][SearchStep::Done] if every byte of the haystack + /// has been visited + /// + /// The stream of [`Match`][SearchStep::Match] and + /// [`Reject`][SearchStep::Reject] values up to a [`Done`][SearchStep::Done] + /// will contain index ranges that are adjacent, non-overlapping, covering + /// the whole haystack, and laying on utf8 boundaries. + /// + /// A [`Match`][SearchStep::Match] result needs to contain the whole matched + /// pattern, however [`Reject`][SearchStep::Reject] results may be split up + /// into arbitrary many adjacent fragments. Both ranges may have zero + /// length. 
+ /// + /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` might + /// produce the stream `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, + /// 1)]`. + fn next_back(&mut self) -> SearchStep; + + /// Finds the next [`Match`][SearchStep::Match] result. + /// See [`next_back()`][ReverseSearcher::next_back]. + fn next_match_back(&mut self) -> Option<(H::Cursor, H::Cursor)> { + loop { + match self.next_back() { + SearchStep::Match(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } + + /// Finds the next [`Reject`][SearchStep::Reject] result. + /// See [`next_back()`][ReverseSearcher::next_back]. + fn next_reject_back(&mut self) -> Option<(H::Cursor, H::Cursor)> { + loop { + match self.next_back() { + SearchStep::Reject(a, b) => return Some((a, b)), + SearchStep::Done => return None, + _ => continue, + } + } + } +} + +/// A marker trait to express that a [`ReverseSearcher`] can be used for +/// a [`DoubleEndedIterator`] implementation. +/// +/// For this, the impl of [`Searcher`] and [`ReverseSearcher`] need to follow +/// these conditions: +/// +/// - All results of `next()` need to be identical to the results of +/// `next_back()` in reverse order. +/// - `next()` and `next_back()` need to behave as the two ends of a range of +/// values, that is they can not "walk past each other". +/// +/// # Examples +/// +/// `char::Searcher` is a `DoubleEndedSearcher` because searching for a [`char`] +/// only requires looking at one at a time, which behaves the same from both +/// ends. +/// +/// `(&str)::Searcher` is not a `DoubleEndedSearcher` because the pattern `"aa"` +/// in the haystack `"aaa"` matches as either `"[aa]a"` or `"a[aa]"`, depending +/// from which side it is searched. +pub trait DoubleEndedSearcher: ReverseSearcher {} + +/// A wrapper around single-argument function returning a boolean, +/// i.e. a predicate function. +/// +/// `Predicate` objects are created with [`predicate`] function. 
+#[derive(Clone, Debug)] +pub struct Predicate(F); + +/// Constructs a wrapper for a single-argument function returning a boolean, +/// i.e. a predicate function. +/// +/// This is intended to be used as a pattern when working with haystacks which +/// (for whatever reason) cannot support naked function traits as patterns. +pub fn predicate bool>(pred: F) -> Predicate { + Predicate(pred) +} + +impl Predicate { + /// Executes the predicate returning its result. + pub fn test(&mut self, element: T) -> bool + where + F: FnMut(T) -> bool, + { + self.0(element) + } + + /// Returns reference to the wrapped predicate function. + pub fn as_fn(&mut self) -> &mut F + where + F: FnMut(T) -> bool, + { + &mut self.0 + } + + /// Consumes this object and returns wrapped predicate function. + pub fn into_fn(self) -> F + where + F: FnMut(T) -> bool, + { + self.0 + } +} + +////////////////////////////////////////////////////////////////////////////// +// Internal EmptyNeedleSearcher helper +////////////////////////////////////////////////////////////////////////////// + +/// Helper for implementing searchers looking for empty patterns. +/// +/// An empty pattern matches around every element of a haystack. For example, +/// within a `&str` it matches around every character. (This includes at the +/// beginning and end of the string). +/// +/// This struct helps implement searchers for empty patterns for various +/// haystacks. The only requirement is a function which advances the start +/// position or end position of the haystack range. 
+/// +/// # Examples +/// +/// ``` +/// #![feature(pattern, pattern_internals)] +/// use core::pattern::{EmptyNeedleSearcher, SearchStep}; +/// +/// let haystack = "fóó"; +/// let mut searcher = EmptyNeedleSearcher::new(haystack); +/// let advance = |range: core::ops::Range| { +/// range.start + haystack[range].chars().next().unwrap().len_utf8() +/// }; +/// let steps = core::iter::from_fn(|| { +/// match searcher.next_fwd(advance) { +/// SearchStep::Done => None, +/// step => Some(step) +/// } +/// }).collect::>(); +/// assert_eq!(&[ +/// SearchStep::Match(0, 0), +/// SearchStep::Reject(0, 1), +/// SearchStep::Match(1, 1), +/// SearchStep::Reject(1, 3), +/// SearchStep::Match(3, 3), +/// SearchStep::Reject(3, 5), +/// SearchStep::Match(5, 5), +/// ], steps.as_slice()); +/// ``` +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[unstable(feature = "pattern_internals", issue = "none")] +pub struct EmptyNeedleSearcher { + start: T, + end: T, + is_match_fwd: bool, + is_match_bwd: bool, + // Needed in case of an empty haystack, see #85462 + is_finished: bool, +} + +impl EmptyNeedleSearcher { + /// Creates a new empty needle searcher for given haystack. + /// + /// The haystack is used to initialise the range of valid cursors positions. + pub fn new>(haystack: H) -> Self { + Self { + start: haystack.cursor_at_front(), + end: haystack.cursor_at_back(), + is_match_bwd: true, + is_match_fwd: true, + is_finished: false, + } + } + + /// Returns next search result. + /// + /// The callback function is used to advance the **start** of the range the + /// searcher is working on. It is passed the current range of cursor + /// positions that weren’t visited yet and it must return the new start + /// cursor position. It’s never called with an empty range. For some + /// haystacks the callback may be as simple as a closure returning the start + /// incremented by one; others might require looking for a new valid + /// boundary. 
+ pub fn next_fwd, F>(&mut self, advance_fwd: F) -> R + where + F: FnOnce(crate::ops::Range) -> T, + { + if self.is_finished { + return R::DONE; + } + if take(&mut self.is_match_fwd) { + if let Some(ret) = R::matching(self.start, self.start) { + return ret; + } + } + if self.start < self.end { + let pos = self.start; + self.start = advance_fwd(self.start..self.end); + if let Some(ret) = R::rejecting(pos, self.start) { + self.is_match_fwd = true; + return ret; + } + return R::matching(self.start, self.start).unwrap(); + } + self.is_finished = true; + R::DONE + } + + /// Returns next search result. + /// + /// The callback function is used to advance the **end** of the range the + /// searcher is working on backwards. It is passed the current range of + /// cursor positions that weren’t visited yet and it must return the new end + /// cursor position. It’s never called with an empty range. For some + /// haystacks the callback may be as simple as a closure returning the end + /// decremented by one; others might require looking for a new valid + /// boundary. + pub fn next_bwd, F>(&mut self, advance_bwd: F) -> R + where + F: FnOnce(crate::ops::Range) -> T, + { + if self.is_finished { + return R::DONE; + } + if take(&mut self.is_match_bwd) { + if let Some(ret) = R::matching(self.end, self.end) { + return ret; + } + } + if self.start < self.end { + let pos = self.end; + self.end = advance_bwd(self.start..self.end); + if let Some(ret) = R::rejecting(self.end, pos) { + self.is_match_bwd = true; + return ret; + } + return R::matching(self.end, self.end).unwrap(); + } + self.is_finished = true; + R::DONE + } +} + +////////////////////////////////////////////////////////////////////////////// +// Internal Split and SplitN implementations +////////////////////////////////////////////////////////////////////////////// + +/// Helper type for implementing split iterators. +/// +/// It’s a generic type which works with any [`Haystack`] and [`Searcher`] over +/// that haystack. 
Intended usage is to create a newtype wrapping this type +/// which implements iterator interface on top of [`next_fwd`][Split::next_fwd] +/// or [`next_bwd`][Split::next_bwd] methods. +/// +/// Note that unless `S` implements [`DoubleEndedSearcher`] trait, it’s +/// incorrect to use this type to implement a double ended iterator. +/// +/// For an example of this type in use, see [`core::str::Split`]. +#[unstable(feature = "pattern_internals", issue = "none")] +pub struct Split> { + /// Start of the region of the haystack yet to be examined. + start: H::Cursor, + /// End of the region of the haystack yet to be examined. + end: H::Cursor, + /// Searcher returning matches of the delimiter pattern. + searcher: S, + /// Whether to return an empty part if there’s delimiter at the end of the + /// haystack. + allow_trailing_empty: bool, + /// Whether splitting has finished. + finished: bool, +} + +/// Helper type for implementing split iterators with a split limit. +/// +/// It’s like [`Split`] but limits number of parts the haystack will be split +/// into. +#[unstable(feature = "pattern_internals", issue = "none")] +pub struct SplitN> { + /// Inner split implementation. + inner: Split, + /// Maximum number of parts the haystack can be split into. 
+ limit: usize, +} + +impl + Clone> Clone for Split { + fn clone(&self) -> Self { + Self { searcher: self.searcher.clone(), ..*self } + } +} + +impl + Clone> Clone for SplitN { + fn clone(&self) -> Self { + Self { inner: self.inner.clone(), ..*self } + } +} + +impl fmt::Debug for Split +where + H: Haystack, + S: Searcher + fmt::Debug, +{ + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.debug_struct("Split") + .field("start", &self.start) + .field("end", &self.end) + .field("searcher", &self.searcher) + .field("allow_trailing_empty", &self.allow_trailing_empty) + .field("finished", &self.finished) + .finish() + } +} + +impl fmt::Debug for SplitN +where + H: Haystack, + S: Searcher + fmt::Debug, +{ + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.debug_struct("SplitN").field("inner", &self.inner).field("limit", &self.limit).finish() + } +} + +impl> Split { + /// Creates a new object configured without a limit and with + /// `allow_trailing_empty` option disabled. + /// + /// To set `allow_trailing_empty`, use + /// [`with_allow_trailing_empty()`][Self::with_allow_trailing_empty] method. + /// To set split limit, use [`with_limit()`][Self::with_limit] method. + pub fn new(searcher: S) -> Self { + let haystack = searcher.haystack(); + Self { + searcher, + start: haystack.cursor_at_front(), + end: haystack.cursor_at_back(), + allow_trailing_empty: false, + finished: false, + } + } + + /// Changes splits limit from unlimited to given value. + /// + /// The limit specifies maximum number of parts haystack will be split into. + pub fn with_limit(self, limit: usize) -> SplitN { + SplitN { inner: self, limit } + } + + /// Enables allow_trailing_empty option. + /// + /// If enabled (which is not the default), if the haystack is empty or + /// terminated by a pattern match, the last haystack part returned will be + /// empty. Otherwise, the last empty split is not returned. 
+ pub fn with_allow_trailing_empty(mut self) -> Self { + self.allow_trailing_empty = true; + self + } +} + +impl> Split { + /// Returns next part of the haystack or `None` if splitting is done. + /// + /// If `INCLUSIVE` is `true`, returned value will include the matching + /// pattern. + pub fn next_fwd(&mut self) -> Option { + if self.finished { + return None; + } + let haystack = self.searcher.haystack(); + if let Some((start, end)) = self.searcher.next_match() { + let range = self.start..(if INCLUSIVE { end } else { start }); + self.start = end; + // SAFETY: self.start and self.end come from Haystack or Searcher + // and thus are guaranteed to be valid split positions. + Some(unsafe { haystack.get_unchecked(range) }) + } else { + self.get_end() + } + } + + /// Returns next looking from back of the haystack part of the haystack or + /// `None` if splitting is done. + /// + /// If `INCLUSIVE` is `true`, returned value will include the matching + /// pattern. + pub fn next_bwd(&mut self) -> Option + where + S: ReverseSearcher, + { + if self.finished { + return None; + } + + if !self.allow_trailing_empty { + self.allow_trailing_empty = true; + if let Some(elt) = self.next_bwd::() { + if !elt.is_empty() { + return Some(elt); + } + } + if self.finished { + return None; + } + } + + let range = if let Some((start, end)) = self.searcher.next_match_back() { + end..replace(&mut self.end, if INCLUSIVE { end } else { start }) + } else { + self.finished = true; + self.start..self.end + }; + // SAFETY: All indices come from Haystack or Searcher which guarantee + // that they are valid split positions. + Some(unsafe { self.searcher.haystack().get_unchecked(range) }) + } + + /// Returns remaining part of the haystack that hasn’t been processed yet. + pub fn remainder(&self) -> Option { + (!self.finished).then(|| { + // SAFETY: self.start and self.end come from Haystack or Searcher + // and thus are guaranteed to be valid split positions. 
+ unsafe { self.searcher.haystack().get_unchecked(self.start..self.end) } + }) + } + + /// Returns the final haystack part. + /// + /// Sets `finished` flag so any further calls to this or other methods will + /// return `None`. + fn get_end(&mut self) -> Option { + if !self.finished { + self.finished = true; + if self.allow_trailing_empty || self.start != self.end { + // SAFETY: self.start and self.end come from Haystack or + // Searcher and thus are guaranteed to be valid split positions. + return Some(unsafe { + self.searcher.haystack().get_unchecked(self.start..self.end) + }); + } + } + None + } +} + +impl> SplitN { + /// Returns next part of the haystack or `None` if splitting is done. + /// + /// If `INCLUSIVE` is `true`, returned value will include the matching + /// pattern. + pub fn next_fwd(&mut self) -> Option { + match self.dec_limit()? { + 0 => self.inner.get_end(), + _ => self.inner.next_fwd::(), + } + } + + /// Returns next looking from back of the haystack part of the haystack or + /// `None` if splitting is done. + /// + /// If `INCLUSIVE` is `true`, returned value will include the matching + /// pattern. + pub fn next_bwd(&mut self) -> Option + where + S: ReverseSearcher, + { + match self.dec_limit()? { + 0 => self.inner.get_end(), + _ => self.inner.next_bwd::(), + } + } + + /// Returns remaining part of the haystack that hasn’t been processed yet. + pub fn remainder(&self) -> Option { + self.inner.remainder() + } + + /// Decrements limit and returns its new value or None if it’s already zero. 
+ fn dec_limit(&mut self) -> Option { + self.limit = self.limit.checked_sub(1)?; + Some(self.limit) + } +} diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs index 772c3605562cf..a3016194dc001 100644 --- a/library/core/src/str/iter.rs +++ b/library/core/src/str/iter.rs @@ -7,11 +7,10 @@ use crate::iter::{Copied, Filter, FusedIterator, Map, TrustedLen}; use crate::iter::{TrustedRandomAccess, TrustedRandomAccessNoCoerce}; use crate::ops::Try; use crate::option; +use crate::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher}; use crate::slice::{self, Split as SliceSplit}; use super::from_utf8_unchecked; -use super::pattern::Pattern; -use super::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; use super::validations::{next_code_point, next_code_point_reverse}; use super::LinesMap; use super::{BytesIsNotEmpty, UnsafeBytesToStr}; @@ -361,7 +360,7 @@ macro_rules! derive_pattern_clone { (clone $t:ident with |$s:ident| $e:expr) => { impl<'a, P> Clone for $t<'a, P> where - P: Pattern<'a, Searcher: Clone>, + P: Pattern<&'a str, Searcher: Clone>, { fn clone(&self) -> Self { let $s = self; @@ -374,7 +373,7 @@ macro_rules! derive_pattern_clone { /// This macro generates two public iterator structs /// wrapping a private internal one that makes use of the `Pattern` API. /// -/// For all patterns `P: Pattern<'a>` the following items will be +/// For all patterns `P: Pattern<&'a str>` the following items will be /// generated (generics omitted): /// /// struct $forward_iterator($internal_iterator); @@ -434,12 +433,14 @@ macro_rules! 
generate_pattern_iterators { } => { $(#[$forward_iterator_attribute])* $(#[$common_stability_attribute])* - pub struct $forward_iterator<'a, P: Pattern<'a>>(pub(super) $internal_iterator<'a, P>); + pub struct $forward_iterator<'a, P: Pattern<&'a str>>( + pub(super) $internal_iterator<'a, P> + ); $(#[$common_stability_attribute])* impl<'a, P> fmt::Debug for $forward_iterator<'a, P> where - P: Pattern<'a, Searcher: fmt::Debug>, + P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_tuple(stringify!($forward_iterator)) @@ -449,7 +450,7 @@ macro_rules! generate_pattern_iterators { } $(#[$common_stability_attribute])* - impl<'a, P: Pattern<'a>> Iterator for $forward_iterator<'a, P> { + impl<'a, P: Pattern<&'a str>> Iterator for $forward_iterator<'a, P> { type Item = $iterty; #[inline] @@ -461,7 +462,7 @@ macro_rules! generate_pattern_iterators { $(#[$common_stability_attribute])* impl<'a, P> Clone for $forward_iterator<'a, P> where - P: Pattern<'a, Searcher: Clone>, + P: Pattern<&'a str, Searcher: Clone>, { fn clone(&self) -> Self { $forward_iterator(self.0.clone()) @@ -470,12 +471,14 @@ macro_rules! generate_pattern_iterators { $(#[$reverse_iterator_attribute])* $(#[$common_stability_attribute])* - pub struct $reverse_iterator<'a, P: Pattern<'a>>(pub(super) $internal_iterator<'a, P>); + pub struct $reverse_iterator<'a, P: Pattern<&'a str>>( + pub(super) $internal_iterator<'a, P> + ); $(#[$common_stability_attribute])* impl<'a, P> fmt::Debug for $reverse_iterator<'a, P> where - P: Pattern<'a, Searcher: fmt::Debug>, + P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_tuple(stringify!($reverse_iterator)) @@ -487,7 +490,7 @@ macro_rules! 
generate_pattern_iterators { $(#[$common_stability_attribute])* impl<'a, P> Iterator for $reverse_iterator<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { type Item = $iterty; @@ -500,7 +503,7 @@ macro_rules! generate_pattern_iterators { $(#[$common_stability_attribute])* impl<'a, P> Clone for $reverse_iterator<'a, P> where - P: Pattern<'a, Searcher: Clone>, + P: Pattern<&'a str, Searcher: Clone>, { fn clone(&self) -> Self { $reverse_iterator(self.0.clone()) @@ -508,12 +511,12 @@ macro_rules! generate_pattern_iterators { } #[stable(feature = "fused", since = "1.26.0")] - impl<'a, P: Pattern<'a>> FusedIterator for $forward_iterator<'a, P> {} + impl<'a, P: Pattern<&'a str>> FusedIterator for $forward_iterator<'a, P> {} #[stable(feature = "fused", since = "1.26.0")] impl<'a, P> FusedIterator for $reverse_iterator<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, {} generate_pattern_iterators!($($t)* with $(#[$common_stability_attribute])*, @@ -528,7 +531,7 @@ macro_rules! generate_pattern_iterators { $(#[$common_stability_attribute])* impl<'a, P> DoubleEndedIterator for $forward_iterator<'a, P> where - P: Pattern<'a, Searcher: DoubleEndedSearcher<'a>>, + P: Pattern<&'a str, Searcher: DoubleEndedSearcher<&'a str>>, { #[inline] fn next_back(&mut self) -> Option<$iterty> { @@ -539,7 +542,7 @@ macro_rules! generate_pattern_iterators { $(#[$common_stability_attribute])* impl<'a, P> DoubleEndedIterator for $reverse_iterator<'a, P> where - P: Pattern<'a, Searcher: DoubleEndedSearcher<'a>>, + P: Pattern<&'a str, Searcher: DoubleEndedSearcher<&'a str>>, { #[inline] fn next_back(&mut self) -> Option<$iterty> { @@ -554,177 +557,64 @@ macro_rules! generate_pattern_iterators { } => {} } -derive_pattern_clone! 
{ - clone SplitInternal - with |s| SplitInternal { matcher: s.matcher.clone(), ..*s } +pub(super) struct SplitInternal<'a, P: Pattern<&'a str>>( + core::pattern::Split<&'a str, P::Searcher>, +); + +impl<'a, P: Pattern<&'a str>> SplitInternal<'a, P> { + pub(super) fn new(haystack: &'a str, pattern: P) -> Self { + Self(core::pattern::Split::new(pattern.into_searcher(haystack))) + } + + pub(super) fn with_allow_trailing_empty(self) -> Self { + Self(self.0.with_allow_trailing_empty()) + } + + pub(super) fn with_limit(self, count: usize) -> SplitNInternal<'a, P> { + SplitNInternal(self.0.with_limit(count)) + } } -pub(super) struct SplitInternal<'a, P: Pattern<'a>> { - pub(super) start: usize, - pub(super) end: usize, - pub(super) matcher: P::Searcher, - pub(super) allow_trailing_empty: bool, - pub(super) finished: bool, +impl<'a, P: Pattern<&'a str, Searcher: Clone>> Clone for SplitInternal<'a, P> { + fn clone(&self) -> Self { + Self(self.0.clone()) + } } impl<'a, P> fmt::Debug for SplitInternal<'a, P> where - P: Pattern<'a, Searcher: fmt::Debug>, + P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("SplitInternal") - .field("start", &self.start) - .field("end", &self.end) - .field("matcher", &self.matcher) - .field("allow_trailing_empty", &self.allow_trailing_empty) - .field("finished", &self.finished) - .finish() + self.0.fmt(f) } } -impl<'a, P: Pattern<'a>> SplitInternal<'a, P> { - #[inline] - fn get_end(&mut self) -> Option<&'a str> { - if !self.finished { - self.finished = true; - - if self.allow_trailing_empty || self.end - self.start > 0 { - // SAFETY: `self.start` and `self.end` always lie on unicode boundaries. 
- let string = unsafe { self.matcher.haystack().get_unchecked(self.start..self.end) }; - return Some(string); - } - } - - None - } - - #[inline] +impl<'a, P: Pattern<&'a str>> SplitInternal<'a, P> { fn next(&mut self) -> Option<&'a str> { - if self.finished { - return None; - } - - let haystack = self.matcher.haystack(); - match self.matcher.next_match() { - // SAFETY: `Searcher` guarantees that `a` and `b` lie on unicode boundaries. - Some((a, b)) => unsafe { - let elt = haystack.get_unchecked(self.start..a); - self.start = b; - Some(elt) - }, - None => self.get_end(), - } + self.0.next_fwd::() } - #[inline] fn next_inclusive(&mut self) -> Option<&'a str> { - if self.finished { - return None; - } - - let haystack = self.matcher.haystack(); - match self.matcher.next_match() { - // SAFETY: `Searcher` guarantees that `b` lies on unicode boundary, - // and self.start is either the start of the original string, - // or `b` was assigned to it, so it also lies on unicode boundary. - Some((_, b)) => unsafe { - let elt = haystack.get_unchecked(self.start..b); - self.start = b; - Some(elt) - }, - None => self.get_end(), - } + self.0.next_fwd::() } - #[inline] fn next_back(&mut self) -> Option<&'a str> where - P::Searcher: ReverseSearcher<'a>, + P::Searcher: ReverseSearcher<&'a str>, { - if self.finished { - return None; - } - - if !self.allow_trailing_empty { - self.allow_trailing_empty = true; - match self.next_back() { - Some(elt) if !elt.is_empty() => return Some(elt), - _ => { - if self.finished { - return None; - } - } - } - } - - let haystack = self.matcher.haystack(); - match self.matcher.next_match_back() { - // SAFETY: `Searcher` guarantees that `a` and `b` lie on unicode boundaries. - Some((a, b)) => unsafe { - let elt = haystack.get_unchecked(b..self.end); - self.end = a; - Some(elt) - }, - // SAFETY: `self.start` and `self.end` always lie on unicode boundaries. 
- None => unsafe { - self.finished = true; - Some(haystack.get_unchecked(self.start..self.end)) - }, - } + self.0.next_bwd::() } - #[inline] fn next_back_inclusive(&mut self) -> Option<&'a str> where - P::Searcher: ReverseSearcher<'a>, + P::Searcher: ReverseSearcher<&'a str>, { - if self.finished { - return None; - } - - if !self.allow_trailing_empty { - self.allow_trailing_empty = true; - match self.next_back_inclusive() { - Some(elt) if !elt.is_empty() => return Some(elt), - _ => { - if self.finished { - return None; - } - } - } - } - - let haystack = self.matcher.haystack(); - match self.matcher.next_match_back() { - // SAFETY: `Searcher` guarantees that `b` lies on unicode boundary, - // and self.end is either the end of the original string, - // or `b` was assigned to it, so it also lies on unicode boundary. - Some((_, b)) => unsafe { - let elt = haystack.get_unchecked(b..self.end); - self.end = b; - Some(elt) - }, - // SAFETY: self.start is either the start of the original string, - // or start of a substring that represents the part of the string that hasn't - // iterated yet. Either way, it is guaranteed to lie on unicode boundary. - // self.end is either the end of the original string, - // or `b` was assigned to it, so it also lies on unicode boundary. - None => unsafe { - self.finished = true; - Some(haystack.get_unchecked(self.start..self.end)) - }, - } + self.0.next_bwd::() } - #[inline] fn remainder(&self) -> Option<&'a str> { - // `Self::get_end` doesn't change `self.start` - if self.finished { - return None; - } - - // SAFETY: `self.start` and `self.end` always lie on unicode boundaries. - Some(unsafe { self.matcher.haystack().get_unchecked(self.start..self.end) }) + self.0.remainder() } } @@ -746,7 +636,7 @@ generate_pattern_iterators! { delegate double ended; } -impl<'a, P: Pattern<'a>> Split<'a, P> { +impl<'a, P: Pattern<&'a str>> Split<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. 
@@ -769,7 +659,7 @@ impl<'a, P: Pattern<'a>> Split<'a, P> { } } -impl<'a, P: Pattern<'a>> RSplit<'a, P> { +impl<'a, P: Pattern<&'a str>> RSplit<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. @@ -810,7 +700,7 @@ generate_pattern_iterators! { delegate double ended; } -impl<'a, P: Pattern<'a>> SplitTerminator<'a, P> { +impl<'a, P: Pattern<&'a str>> SplitTerminator<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. @@ -833,7 +723,7 @@ impl<'a, P: Pattern<'a>> SplitTerminator<'a, P> { } } -impl<'a, P: Pattern<'a>> RSplitTerminator<'a, P> { +impl<'a, P: Pattern<&'a str>> RSplitTerminator<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. @@ -858,64 +748,39 @@ impl<'a, P: Pattern<'a>> RSplitTerminator<'a, P> { derive_pattern_clone! { clone SplitNInternal - with |s| SplitNInternal { iter: s.iter.clone(), ..*s } + with |s| SplitNInternal(s.0.clone()) } -pub(super) struct SplitNInternal<'a, P: Pattern<'a>> { - pub(super) iter: SplitInternal<'a, P>, - /// The number of splits remaining - pub(super) count: usize, -} +pub(super) struct SplitNInternal<'a, P: Pattern<&'a str>>( + core::pattern::SplitN<&'a str, P::Searcher>, +); impl<'a, P> fmt::Debug for SplitNInternal<'a, P> where - P: Pattern<'a, Searcher: fmt::Debug>, + P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("SplitNInternal") - .field("iter", &self.iter) - .field("count", &self.count) - .finish() + self.0.fmt(f) } } -impl<'a, P: Pattern<'a>> SplitNInternal<'a, P> { +impl<'a, P: Pattern<&'a str>> SplitNInternal<'a, P> { #[inline] fn next(&mut self) -> Option<&'a str> { - match self.count { - 0 => None, - 1 => { - self.count = 0; - self.iter.get_end() - } - _ => { - self.count -= 1; - self.iter.next() - } - } + self.0.next_fwd::() } #[inline] fn next_back(&mut self) -> Option<&'a str> where - 
P::Searcher: ReverseSearcher<'a>, + P::Searcher: ReverseSearcher<&'a str>, { - match self.count { - 0 => None, - 1 => { - self.count = 0; - self.iter.get_end() - } - _ => { - self.count -= 1; - self.iter.next_back() - } - } + self.0.next_bwd::() } #[inline] fn remainder(&self) -> Option<&'a str> { - self.iter.remainder() + self.0.remainder() } } @@ -937,7 +802,7 @@ generate_pattern_iterators! { delegate single ended; } -impl<'a, P: Pattern<'a>> SplitN<'a, P> { +impl<'a, P: Pattern<&'a str>> SplitN<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. @@ -960,7 +825,7 @@ impl<'a, P: Pattern<'a>> SplitN<'a, P> { } } -impl<'a, P: Pattern<'a>> RSplitN<'a, P> { +impl<'a, P: Pattern<&'a str>> RSplitN<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. @@ -988,18 +853,18 @@ derive_pattern_clone! { with |s| MatchIndicesInternal(s.0.clone()) } -pub(super) struct MatchIndicesInternal<'a, P: Pattern<'a>>(pub(super) P::Searcher); +pub(super) struct MatchIndicesInternal<'a, P: Pattern<&'a str>>(pub(super) P::Searcher); impl<'a, P> fmt::Debug for MatchIndicesInternal<'a, P> where - P: Pattern<'a, Searcher: fmt::Debug>, + P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_tuple("MatchIndicesInternal").field(&self.0).finish() } } -impl<'a, P: Pattern<'a>> MatchIndicesInternal<'a, P> { +impl<'a, P: Pattern<&'a str>> MatchIndicesInternal<'a, P> { #[inline] fn next(&mut self) -> Option<(usize, &'a str)> { self.0 @@ -1011,7 +876,7 @@ impl<'a, P: Pattern<'a>> MatchIndicesInternal<'a, P> { #[inline] fn next_back(&mut self) -> Option<(usize, &'a str)> where - P::Searcher: ReverseSearcher<'a>, + P::Searcher: ReverseSearcher<&'a str>, { self.0 .next_match_back() @@ -1043,18 +908,18 @@ derive_pattern_clone! 
{ with |s| MatchesInternal(s.0.clone()) } -pub(super) struct MatchesInternal<'a, P: Pattern<'a>>(pub(super) P::Searcher); +pub(super) struct MatchesInternal<'a, P: Pattern<&'a str>>(pub(super) P::Searcher); impl<'a, P> fmt::Debug for MatchesInternal<'a, P> where - P: Pattern<'a, Searcher: fmt::Debug>, + P: Pattern<&'a str, Searcher: fmt::Debug>, { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_tuple("MatchesInternal").field(&self.0).finish() } } -impl<'a, P: Pattern<'a>> MatchesInternal<'a, P> { +impl<'a, P: Pattern<&'a str>> MatchesInternal<'a, P> { #[inline] fn next(&mut self) -> Option<&'a str> { // SAFETY: `Searcher` guarantees that `start` and `end` lie on unicode boundaries. @@ -1067,7 +932,7 @@ impl<'a, P: Pattern<'a>> MatchesInternal<'a, P> { #[inline] fn next_back(&mut self) -> Option<&'a str> where - P::Searcher: ReverseSearcher<'a>, + P::Searcher: ReverseSearcher<&'a str>, { // SAFETY: `Searcher` guarantees that `start` and `end` lie on unicode boundaries. 
self.0.next_match_back().map(|(a, b)| unsafe { @@ -1213,7 +1078,7 @@ pub struct SplitAsciiWhitespace<'a> { /// /// [`split_inclusive`]: str::split_inclusive #[stable(feature = "split_inclusive", since = "1.51.0")] -pub struct SplitInclusive<'a, P: Pattern<'a>>(pub(super) SplitInternal<'a, P>); +pub struct SplitInclusive<'a, P: Pattern<&'a str>>(pub(super) SplitInternal<'a, P>); #[stable(feature = "split_whitespace", since = "1.1.0")] impl<'a> Iterator for SplitWhitespace<'a> { @@ -1335,7 +1200,7 @@ impl<'a> SplitAsciiWhitespace<'a> { } #[stable(feature = "split_inclusive", since = "1.51.0")] -impl<'a, P: Pattern<'a>> Iterator for SplitInclusive<'a, P> { +impl<'a, P: Pattern<&'a str>> Iterator for SplitInclusive<'a, P> { type Item = &'a str; #[inline] @@ -1345,7 +1210,7 @@ impl<'a, P: Pattern<'a>> Iterator for SplitInclusive<'a, P> { } #[stable(feature = "split_inclusive", since = "1.51.0")] -impl<'a, P: Pattern<'a, Searcher: fmt::Debug>> fmt::Debug for SplitInclusive<'a, P> { +impl<'a, P: Pattern<&'a str, Searcher: fmt::Debug>> fmt::Debug for SplitInclusive<'a, P> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("SplitInclusive").field("0", &self.0).finish() } @@ -1353,14 +1218,14 @@ impl<'a, P: Pattern<'a, Searcher: fmt::Debug>> fmt::Debug for SplitInclusive<'a, // FIXME(#26925) Remove in favor of `#[derive(Clone)]` #[stable(feature = "split_inclusive", since = "1.51.0")] -impl<'a, P: Pattern<'a, Searcher: Clone>> Clone for SplitInclusive<'a, P> { +impl<'a, P: Pattern<&'a str, Searcher: Clone>> Clone for SplitInclusive<'a, P> { fn clone(&self) -> Self { SplitInclusive(self.0.clone()) } } #[stable(feature = "split_inclusive", since = "1.51.0")] -impl<'a, P: Pattern<'a, Searcher: ReverseSearcher<'a>>> DoubleEndedIterator +impl<'a, P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>> DoubleEndedIterator for SplitInclusive<'a, P> { #[inline] @@ -1370,9 +1235,9 @@ impl<'a, P: Pattern<'a, Searcher: ReverseSearcher<'a>>> DoubleEndedIterator 
} #[stable(feature = "split_inclusive", since = "1.51.0")] -impl<'a, P: Pattern<'a>> FusedIterator for SplitInclusive<'a, P> {} +impl<'a, P: Pattern<&'a str>> FusedIterator for SplitInclusive<'a, P> {} -impl<'a, P: Pattern<'a>> SplitInclusive<'a, P> { +impl<'a, P: Pattern<&'a str>> SplitInclusive<'a, P> { /// Returns remainder of the split string. /// /// If the iterator is empty, returns `None`. diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 66fa9cf6f64c0..594e07a266141 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -13,12 +13,10 @@ mod iter; mod traits; mod validations; -use self::pattern::Pattern; -use self::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; - use crate::ascii; use crate::char::{self, EscapeDebugExtArgs}; use crate::mem; +use crate::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher}; use crate::slice::{self, SliceIndex}; pub mod pattern; @@ -70,12 +68,15 @@ pub use iter::SplitAsciiWhitespace; #[stable(feature = "split_inclusive", since = "1.51.0")] pub use iter::SplitInclusive; +pub(crate) use validations::next_code_point_reverse; #[unstable(feature = "str_internals", issue = "none")] -pub use validations::{next_code_point, utf8_char_width}; +pub use validations::{ + next_code_point, try_next_code_point, try_next_code_point_reverse, utf8_char_width, +}; use iter::MatchIndicesInternal; +use iter::MatchesInternal; use iter::SplitInternal; -use iter::{MatchesInternal, SplitNInternal}; #[inline(never)] #[cold] @@ -1060,7 +1061,7 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn contains<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool { + pub fn contains<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> bool { pat.is_contained_in(self) } @@ -1086,7 +1087,7 @@ impl str { /// assert!(!bananas.starts_with("nana")); /// ``` #[stable(feature = "rust1", since = "1.0.0")] - pub fn starts_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool { + 
pub fn starts_with<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> bool { pat.is_prefix_of(self) } @@ -1114,7 +1115,7 @@ impl str { #[stable(feature = "rust1", since = "1.0.0")] pub fn ends_with<'a, P>(&'a self, pat: P) -> bool where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { pat.is_suffix_of(self) } @@ -1163,7 +1164,7 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn find<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option { + pub fn find<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> Option { pat.into_searcher(self).next_match().map(|(i, _)| i) } @@ -1211,7 +1212,7 @@ impl str { #[inline] pub fn rfind<'a, P>(&'a self, pat: P) -> Option where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { pat.into_searcher(self).next_match_back().map(|(i, _)| i) } @@ -1331,14 +1332,8 @@ impl str { /// [`split_whitespace`]: str::split_whitespace #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn split<'a, P: Pattern<'a>>(&'a self, pat: P) -> Split<'a, P> { - Split(SplitInternal { - start: 0, - end: self.len(), - matcher: pat.into_searcher(self), - allow_trailing_empty: true, - finished: false, - }) + pub fn split<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> Split<'a, P> { + Split(SplitInternal::new(self, pat).with_allow_trailing_empty()) } /// An iterator over substrings of this string slice, separated by @@ -1371,14 +1366,8 @@ impl str { /// ``` #[stable(feature = "split_inclusive", since = "1.51.0")] #[inline] - pub fn split_inclusive<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitInclusive<'a, P> { - SplitInclusive(SplitInternal { - start: 0, - end: self.len(), - matcher: pat.into_searcher(self), - allow_trailing_empty: false, - finished: false, - }) + pub fn split_inclusive<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> SplitInclusive<'a, P> { + SplitInclusive(SplitInternal::new(self, pat)) } /// An iterator 
over substrings of the given string slice, separated by @@ -1428,7 +1417,7 @@ impl str { #[inline] pub fn rsplit<'a, P>(&'a self, pat: P) -> RSplit<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { RSplit(self.split(pat).0) } @@ -1477,8 +1466,8 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn split_terminator<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitTerminator<'a, P> { - SplitTerminator(SplitInternal { allow_trailing_empty: false, ..self.split(pat).0 }) + pub fn split_terminator<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> SplitTerminator<'a, P> { + SplitTerminator(SplitInternal::new(self, pat)) } /// An iterator over substrings of `self`, separated by characters @@ -1525,7 +1514,7 @@ impl str { #[inline] pub fn rsplit_terminator<'a, P>(&'a self, pat: P) -> RSplitTerminator<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { RSplitTerminator(self.split_terminator(pat).0) } @@ -1578,8 +1567,8 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] #[inline] - pub fn splitn<'a, P: Pattern<'a>>(&'a self, n: usize, pat: P) -> SplitN<'a, P> { - SplitN(SplitNInternal { iter: self.split(pat).0, count: n }) + pub fn splitn<'a, P: Pattern<&'a str>>(&'a self, n: usize, pat: P) -> SplitN<'a, P> { + SplitN(self.split(pat).0.with_limit(n)) } /// An iterator over substrings of this string slice, separated by a @@ -1629,7 +1618,7 @@ impl str { #[inline] pub fn rsplitn<'a, P>(&'a self, n: usize, pat: P) -> RSplitN<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { RSplitN(self.splitn(n, pat).0) } @@ -1647,7 +1636,10 @@ impl str { /// ``` #[stable(feature = "str_split_once", since = "1.52.0")] #[inline] - pub fn split_once<'a, P: Pattern<'a>>(&'a self, delimiter: P) -> Option<(&'a str, &'a str)> { + pub fn 
split_once<'a, P: Pattern<&'a str>>( + &'a self, + delimiter: P, + ) -> Option<(&'a str, &'a str)> { let (start, end) = delimiter.into_searcher(self).next_match()?; // SAFETY: `Searcher` is known to return valid indices. unsafe { Some((self.get_unchecked(..start), self.get_unchecked(end..))) } @@ -1667,7 +1659,7 @@ impl str { #[inline] pub fn rsplit_once<'a, P>(&'a self, delimiter: P) -> Option<(&'a str, &'a str)> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { let (start, end) = delimiter.into_searcher(self).next_match_back()?; // SAFETY: `Searcher` is known to return valid indices. @@ -1707,7 +1699,7 @@ impl str { /// ``` #[stable(feature = "str_matches", since = "1.2.0")] #[inline] - pub fn matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> Matches<'a, P> { + pub fn matches<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> Matches<'a, P> { Matches(MatchesInternal(pat.into_searcher(self))) } @@ -1745,7 +1737,7 @@ impl str { #[inline] pub fn rmatches<'a, P>(&'a self, pat: P) -> RMatches<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { RMatches(self.matches(pat).0) } @@ -1789,7 +1781,7 @@ impl str { /// ``` #[stable(feature = "str_match_indices", since = "1.5.0")] #[inline] - pub fn match_indices<'a, P: Pattern<'a>>(&'a self, pat: P) -> MatchIndices<'a, P> { + pub fn match_indices<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> MatchIndices<'a, P> { MatchIndices(MatchIndicesInternal(pat.into_searcher(self))) } @@ -1833,7 +1825,7 @@ impl str { #[inline] pub fn rmatch_indices<'a, P>(&'a self, pat: P) -> RMatchIndices<'a, P> where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { RMatchIndices(self.match_indices(pat).0) } @@ -2050,7 +2042,7 @@ impl str { #[stable(feature = "rust1", since = "1.0.0")] pub fn trim_matches<'a, P>(&'a self, pat: P) -> &'a str where - P: Pattern<'a, 
Searcher: DoubleEndedSearcher<'a>>, + P: Pattern<&'a str, Searcher: DoubleEndedSearcher<&'a str>>, { let mut i = 0; let mut j = 0; @@ -2097,7 +2089,7 @@ impl str { #[must_use = "this returns the trimmed string as a new slice, \ without modifying the original"] #[stable(feature = "trim_direction", since = "1.30.0")] - pub fn trim_start_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str { + pub fn trim_start_matches<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> &'a str { let mut i = self.len(); let mut matcher = pat.into_searcher(self); if let Some((a, _)) = matcher.next_reject() { @@ -2130,7 +2122,7 @@ impl str { #[must_use = "this returns the remaining substring as a new slice, \ without modifying the original"] #[stable(feature = "str_strip", since = "1.45.0")] - pub fn strip_prefix<'a, P: Pattern<'a>>(&'a self, prefix: P) -> Option<&'a str> { + pub fn strip_prefix<'a, P: Pattern<&'a str>>(&'a self, prefix: P) -> Option<&'a str> { prefix.strip_prefix_of(self) } @@ -2159,8 +2151,8 @@ impl str { #[stable(feature = "str_strip", since = "1.45.0")] pub fn strip_suffix<'a, P>(&'a self, suffix: P) -> Option<&'a str> where - P: Pattern<'a>, -

>::Searcher: ReverseSearcher<'a>, + P: Pattern<&'a str>, +

>::Searcher: ReverseSearcher<&'a str>, { suffix.strip_suffix_of(self) } @@ -2203,7 +2195,7 @@ impl str { #[stable(feature = "trim_direction", since = "1.30.0")] pub fn trim_end_matches<'a, P>(&'a self, pat: P) -> &'a str where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { let mut j = 0; let mut matcher = pat.into_searcher(self); @@ -2247,7 +2239,7 @@ impl str { note = "superseded by `trim_start_matches`", suggestion = "trim_start_matches" )] - pub fn trim_left_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str { + pub fn trim_left_matches<'a, P: Pattern<&'a str>>(&'a self, pat: P) -> &'a str { self.trim_start_matches(pat) } @@ -2292,7 +2284,7 @@ impl str { )] pub fn trim_right_matches<'a, P>(&'a self, pat: P) -> &'a str where - P: Pattern<'a, Searcher: ReverseSearcher<'a>>, + P: Pattern<&'a str, Searcher: ReverseSearcher<&'a str>>, { self.trim_end_matches(pat) } diff --git a/library/core/src/str/pattern.rs b/library/core/src/str/pattern.rs index e3a464a1c51a9..ca53e1ae8268b 100644 --- a/library/core/src/str/pattern.rs +++ b/library/core/src/str/pattern.rs @@ -1,36 +1,47 @@ -//! The string Pattern API. +//! [The Pattern API] implementation for searching in `&str`. //! -//! The Pattern API provides a generic mechanism for using different pattern -//! types when searching through a string. +//! The implementation provides generic mechanism for using different pattern +//! types when searching through a string. Although this API is unstable, it is +//! exposed via stable APIs on the [`str`] type. //! -//! For more details, see the traits [`Pattern`], [`Searcher`], -//! [`ReverseSearcher`], and [`DoubleEndedSearcher`]. +//! Depending on the type of the pattern, the behaviour of methods like +//! [`str::find`] and [`str::contains`] can change. The table below describes +//! some of those behaviours. //! -//! Although this API is unstable, it is exposed via stable APIs on the -//! [`str`] type. +//! 
| Pattern type | Match condition | +//! |--------------------------|-------------------------------------------| +//! | `&str` | is substring | +//! | `char` | is contained in string | +//! | `&[char]` | any char in slice is contained in string | +//! | `F: FnMut(char) -> bool` | `F` returns `true` for a char in string | +//! | `&&str` | is substring | +//! | `&String` | is substring | //! //! # Examples //! -//! [`Pattern`] is [implemented][pattern-impls] in the stable API for -//! [`&str`][`str`], [`char`], slices of [`char`], and functions and closures -//! implementing `FnMut(char) -> bool`. -//! //! ``` //! let s = "Can you find a needle in a haystack?"; //! //! // &str pattern //! assert_eq!(s.find("you"), Some(4)); +//! assert_eq!(s.find("thou"), None); +//! //! // char pattern //! assert_eq!(s.find('n'), Some(2)); -//! // array of chars pattern +//! assert_eq!(s.find('N'), None); +//! +//! // Array of chars pattern and slices thereof //! assert_eq!(s.find(&['a', 'e', 'i', 'o', 'u']), Some(1)); -//! // slice of chars pattern //! assert_eq!(s.find(&['a', 'e', 'i', 'o', 'u'][..]), Some(1)); -//! // closure pattern +//! assert_eq!(s.find(&['q', 'v', 'x']), None); +//! +//! // Predicate closure //! assert_eq!(s.find(|c: char| c.is_ascii_punctuation()), Some(35)); +//! assert_eq!(s.find(|c: char| c.is_lowercase()), Some(1)); +//! assert_eq!(s.find(|c: char| !c.is_ascii()), None); //! ``` //! -//! [pattern-impls]: Pattern#implementors +//! [The Pattern API]: crate::pattern #![unstable( feature = "pattern", @@ -38,496 +49,91 @@ issue = "27721" )] -use crate::cmp; use crate::cmp::Ordering; use crate::fmt; -use crate::slice::memchr; - -// Pattern +use crate::ops::Range; +use crate::pattern::{ + DoubleEndedSearcher, Haystack, Pattern, ReverseSearcher, SearchStep, Searcher, +}; +use crate::str_bytes; -/// A string pattern. -/// -/// A `Pattern<'a>` expresses that the implementing type -/// can be used as a string pattern for searching in a [`&'a str`][str]. 
-/// -/// For example, both `'a'` and `"aa"` are patterns that -/// would match at index `1` in the string `"baaaab"`. -/// -/// The trait itself acts as a builder for an associated -/// [`Searcher`] type, which does the actual work of finding -/// occurrences of the pattern in a string. -/// -/// Depending on the type of the pattern, the behaviour of methods like -/// [`str::find`] and [`str::contains`] can change. The table below describes -/// some of those behaviours. -/// -/// | Pattern type | Match condition | -/// |--------------------------|-------------------------------------------| -/// | `&str` | is substring | -/// | `char` | is contained in string | -/// | `&[char]` | any char in slice is contained in string | -/// | `F: FnMut(char) -> bool` | `F` returns `true` for a char in string | -/// | `&&str` | is substring | -/// | `&String` | is substring | -/// -/// # Examples -/// -/// ``` -/// // &str -/// assert_eq!("abaaa".find("ba"), Some(1)); -/// assert_eq!("abaaa".find("bac"), None); -/// -/// // char -/// assert_eq!("abaaa".find('a'), Some(0)); -/// assert_eq!("abaaa".find('b'), Some(1)); -/// assert_eq!("abaaa".find('c'), None); -/// -/// // &[char; N] -/// assert_eq!("ab".find(&['b', 'a']), Some(0)); -/// assert_eq!("abaaa".find(&['a', 'z']), Some(0)); -/// assert_eq!("abaaa".find(&['c', 'd']), None); -/// -/// // &[char] -/// assert_eq!("ab".find(&['b', 'a'][..]), Some(0)); -/// assert_eq!("abaaa".find(&['a', 'z'][..]), Some(0)); -/// assert_eq!("abaaa".find(&['c', 'd'][..]), None); -/// -/// // FnMut(char) -> bool -/// assert_eq!("abcdef_z".find(|ch| ch > 'd' && ch < 'y'), Some(4)); -/// assert_eq!("abcddd_z".find(|ch| ch > 'd' && ch < 'y'), None); -/// ``` -pub trait Pattern<'a>: Sized { - /// Associated searcher for this pattern - type Searcher: Searcher<'a>; +///////////////////////////////////////////////////////////////////////////// +// Impl for Haystack +///////////////////////////////////////////////////////////////////////////// - /// 
Constructs the associated searcher from - /// `self` and the `haystack` to search in. - fn into_searcher(self, haystack: &'a str) -> Self::Searcher; +impl<'a> Haystack for &'a str { + type Cursor = usize; - /// Checks whether the pattern matches anywhere in the haystack - #[inline] - fn is_contained_in(self, haystack: &'a str) -> bool { - self.into_searcher(haystack).next_match().is_some() + #[inline(always)] + fn cursor_at_front(self) -> usize { + 0 } - - /// Checks whether the pattern matches at the front of the haystack - #[inline] - fn is_prefix_of(self, haystack: &'a str) -> bool { - matches!(self.into_searcher(haystack).next(), SearchStep::Match(0, _)) + #[inline(always)] + fn cursor_at_back(self) -> usize { + self.len() } - /// Checks whether the pattern matches at the back of the haystack - #[inline] - fn is_suffix_of(self, haystack: &'a str) -> bool - where - Self::Searcher: ReverseSearcher<'a>, - { - matches!(self.into_searcher(haystack).next_back(), SearchStep::Match(_, j) if haystack.len() == j) + #[inline(always)] + fn is_empty(self) -> bool { + self.is_empty() } - /// Removes the pattern from the front of haystack, if it matches. - #[inline] - fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> { - if let SearchStep::Match(start, len) = self.into_searcher(haystack).next() { - debug_assert_eq!( - start, 0, - "The first search step from Searcher \ - must include the first character" - ); - // SAFETY: `Searcher` is known to return valid indices. - unsafe { Some(haystack.get_unchecked(len..)) } - } else { - None - } + #[inline(always)] + unsafe fn get_unchecked(self, range: Range) -> Self { + // SAFETY: Caller promises position is a character boundary. + unsafe { self.get_unchecked(range) } } - - /// Removes the pattern from the back of haystack, if it matches. 
- #[inline] - fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> - where - Self::Searcher: ReverseSearcher<'a>, - { - if let SearchStep::Match(start, end) = self.into_searcher(haystack).next_back() { - debug_assert_eq!( - end, - haystack.len(), - "The first search step from ReverseSearcher \ - must include the last character" - ); - // SAFETY: `Searcher` is known to return valid indices. - unsafe { Some(haystack.get_unchecked(..start)) } - } else { - None - } - } -} - -// Searcher - -/// Result of calling [`Searcher::next()`] or [`ReverseSearcher::next_back()`]. -#[derive(Copy, Clone, Eq, PartialEq, Debug)] -pub enum SearchStep { - /// Expresses that a match of the pattern has been found at - /// `haystack[a..b]`. - Match(usize, usize), - /// Expresses that `haystack[a..b]` has been rejected as a possible match - /// of the pattern. - /// - /// Note that there might be more than one `Reject` between two `Match`es, - /// there is no requirement for them to be combined into one. - Reject(usize, usize), - /// Expresses that every byte of the haystack has been visited, ending - /// the iteration. - Done, } -/// A searcher for a string pattern. -/// -/// This trait provides methods for searching for non-overlapping -/// matches of a pattern starting from the front (left) of a string. -/// -/// It will be implemented by associated `Searcher` -/// types of the [`Pattern`] trait. -/// -/// The trait is marked unsafe because the indices returned by the -/// [`next()`][Searcher::next] methods are required to lie on valid utf8 -/// boundaries in the haystack. This enables consumers of this trait to -/// slice the haystack without additional runtime checks. -pub unsafe trait Searcher<'a> { - /// Getter for the underlying string to be searched in - /// - /// Will always return the same [`&str`][str]. - fn haystack(&self) -> &'a str; - - /// Performs the next search step starting from the front. 
- /// - /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` matches - /// the pattern. - /// - Returns [`Reject(a, b)`][SearchStep::Reject] if `haystack[a..b]` can - /// not match the pattern, even partially. - /// - Returns [`Done`][SearchStep::Done] if every byte of the haystack has - /// been visited. - /// - /// The stream of [`Match`][SearchStep::Match] and - /// [`Reject`][SearchStep::Reject] values up to a [`Done`][SearchStep::Done] - /// will contain index ranges that are adjacent, non-overlapping, - /// covering the whole haystack, and laying on utf8 boundaries. - /// - /// A [`Match`][SearchStep::Match] result needs to contain the whole matched - /// pattern, however [`Reject`][SearchStep::Reject] results may be split up - /// into arbitrary many adjacent fragments. Both ranges may have zero length. - /// - /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` - /// might produce the stream - /// `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, 8)]` - fn next(&mut self) -> SearchStep; - - /// Finds the next [`Match`][SearchStep::Match] result. See [`next()`][Searcher::next]. - /// - /// Unlike [`next()`][Searcher::next], there is no guarantee that the returned ranges - /// of this and [`next_reject`][Searcher::next_reject] will overlap. This will return - /// `(start_match, end_match)`, where start_match is the index of where - /// the match begins, and end_match is the index after the end of the match. - #[inline] - fn next_match(&mut self) -> Option<(usize, usize)> { - loop { - match self.next() { - SearchStep::Match(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } - } - - /// Finds the next [`Reject`][SearchStep::Reject] result. See [`next()`][Searcher::next] - /// and [`next_match()`][Searcher::next_match]. - /// - /// Unlike [`next()`][Searcher::next], there is no guarantee that the returned ranges - /// of this and [`next_match`][Searcher::next_match] will overlap. 
- #[inline] - fn next_reject(&mut self) -> Option<(usize, usize)> { - loop { - match self.next() { - SearchStep::Reject(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } - } -} - -/// A reverse searcher for a string pattern. -/// -/// This trait provides methods for searching for non-overlapping -/// matches of a pattern starting from the back (right) of a string. -/// -/// It will be implemented by associated [`Searcher`] -/// types of the [`Pattern`] trait if the pattern supports searching -/// for it from the back. -/// -/// The index ranges returned by this trait are not required -/// to exactly match those of the forward search in reverse. -/// -/// For the reason why this trait is marked unsafe, see the -/// parent trait [`Searcher`]. -pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { - /// Performs the next search step starting from the back. - /// - /// - Returns [`Match(a, b)`][SearchStep::Match] if `haystack[a..b]` - /// matches the pattern. - /// - Returns [`Reject(a, b)`][SearchStep::Reject] if `haystack[a..b]` - /// can not match the pattern, even partially. - /// - Returns [`Done`][SearchStep::Done] if every byte of the haystack - /// has been visited - /// - /// The stream of [`Match`][SearchStep::Match] and - /// [`Reject`][SearchStep::Reject] values up to a [`Done`][SearchStep::Done] - /// will contain index ranges that are adjacent, non-overlapping, - /// covering the whole haystack, and laying on utf8 boundaries. - /// - /// A [`Match`][SearchStep::Match] result needs to contain the whole matched - /// pattern, however [`Reject`][SearchStep::Reject] results may be split up - /// into arbitrary many adjacent fragments. Both ranges may have zero length. - /// - /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"` - /// might produce the stream - /// `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, 1)]`. 
- fn next_back(&mut self) -> SearchStep; - - /// Finds the next [`Match`][SearchStep::Match] result. - /// See [`next_back()`][ReverseSearcher::next_back]. - #[inline] - fn next_match_back(&mut self) -> Option<(usize, usize)> { - loop { - match self.next_back() { - SearchStep::Match(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } - } - - /// Finds the next [`Reject`][SearchStep::Reject] result. - /// See [`next_back()`][ReverseSearcher::next_back]. - #[inline] - fn next_reject_back(&mut self) -> Option<(usize, usize)> { - loop { - match self.next_back() { - SearchStep::Reject(a, b) => return Some((a, b)), - SearchStep::Done => return None, - _ => continue, - } - } - } -} - -/// A marker trait to express that a [`ReverseSearcher`] -/// can be used for a [`DoubleEndedIterator`] implementation. -/// -/// For this, the impl of [`Searcher`] and [`ReverseSearcher`] need -/// to follow these conditions: -/// -/// - All results of `next()` need to be identical -/// to the results of `next_back()` in reverse order. -/// - `next()` and `next_back()` need to behave as -/// the two ends of a range of values, that is they -/// can not "walk past each other". -/// -/// # Examples -/// -/// `char::Searcher` is a `DoubleEndedSearcher` because searching for a -/// [`char`] only requires looking at one at a time, which behaves the same -/// from both ends. -/// -/// `(&str)::Searcher` is not a `DoubleEndedSearcher` because -/// the pattern `"aa"` in the haystack `"aaa"` matches as either -/// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched. -pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} - ///////////////////////////////////////////////////////////////////////////// // Impl for char ///////////////////////////////////////////////////////////////////////////// -/// Associated type for `>::Searcher`. +/// Associated type for `>::Searcher`. 
#[derive(Clone, Debug)] -pub struct CharSearcher<'a> { - haystack: &'a str, - // safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack` - // This invariant can be broken *within* next_match and next_match_back, however - // they must exit with fingers on valid code point boundaries. - /// `finger` is the current byte index of the forward search. - /// Imagine that it exists before the byte at its index, i.e. - /// `haystack[finger]` is the first byte of the slice we must inspect during - /// forward searching - finger: usize, - /// `finger_back` is the current byte index of the reverse search. - /// Imagine that it exists after the byte at its index, i.e. - /// haystack[finger_back - 1] is the last byte of the slice we must inspect during - /// forward searching (and thus the first byte to be inspected when calling next_back()). - finger_back: usize, - /// The character being searched for - needle: char, - - // safety invariant: `utf8_size` must be less than 5 - /// The number of bytes `needle` takes up when encoded in utf8. - utf8_size: usize, - /// A utf8 encoded copy of the `needle` - utf8_encoded: [u8; 4], +pub struct CharSearcher<'a>(str_bytes::CharSearcher<'a, str_bytes::Utf8>); + +impl<'a> CharSearcher<'a> { + fn new(haystack: &'a str, chr: char) -> Self { + Self(str_bytes::CharSearcher::new(str_bytes::Bytes::from(haystack), chr)) + } } -unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { +unsafe impl<'a> Searcher<&'a str> for CharSearcher<'a> { #[inline] fn haystack(&self) -> &'a str { - self.haystack + self.0.haystack().into() } #[inline] fn next(&mut self) -> SearchStep { - let old_finger = self.finger; - // SAFETY: 1-4 guarantee safety of `get_unchecked` - // 1. `self.finger` and `self.finger_back` are kept on unicode boundaries - // (this is invariant) - // 2. `self.finger >= 0` since it starts at 0 and only increases - // 3. 
`self.finger < self.finger_back` because otherwise the char `iter` - // would return `SearchStep::Done` - // 4. `self.finger` comes before the end of the haystack because `self.finger_back` - // starts at the end and only decreases - let slice = unsafe { self.haystack.get_unchecked(old_finger..self.finger_back) }; - let mut iter = slice.chars(); - let old_len = iter.iter.len(); - if let Some(ch) = iter.next() { - // add byte offset of current character - // without re-encoding as utf-8 - self.finger += old_len - iter.iter.len(); - if ch == self.needle { - SearchStep::Match(old_finger, self.finger) - } else { - SearchStep::Reject(old_finger, self.finger) - } - } else { - SearchStep::Done - } + self.0.next() } #[inline] fn next_match(&mut self) -> Option<(usize, usize)> { - loop { - // get the haystack after the last character found - let bytes = self.haystack.as_bytes().get(self.finger..self.finger_back)?; - // the last byte of the utf8 encoded needle - // SAFETY: we have an invariant that `utf8_size < 5` - let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) }; - if let Some(index) = memchr::memchr(last_byte, bytes) { - // The new finger is the index of the byte we found, - // plus one, since we memchr'd for the last byte of the character. - // - // Note that this doesn't always give us a finger on a UTF8 boundary. - // If we *didn't* find our character - // we may have indexed to the non-last byte of a 3-byte or 4-byte character. - // We can't just skip to the next valid starting byte because a character like - // ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find - // the second byte when searching for the third. - // - // However, this is totally okay. While we have the invariant that - // self.finger is on a UTF8 boundary, this invariant is not relied upon - // within this method (it is relied upon in CharSearcher::next()). 
- // - // We only exit this method when we reach the end of the string, or if we - // find something. When we find something the `finger` will be set - // to a UTF8 boundary. - self.finger += index + 1; - if self.finger >= self.utf8_size { - let found_char = self.finger - self.utf8_size; - if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) { - if slice == &self.utf8_encoded[0..self.utf8_size] { - return Some((found_char, self.finger)); - } - } - } - } else { - // found nothing, exit - self.finger = self.finger_back; - return None; - } - } + self.0.next_match() + } + #[inline] + fn next_reject(&mut self) -> Option<(usize, usize)> { + self.0.next_reject() } - - // let next_reject use the default implementation from the Searcher trait } -unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { +unsafe impl<'a> ReverseSearcher<&'a str> for CharSearcher<'a> { #[inline] fn next_back(&mut self) -> SearchStep { - let old_finger = self.finger_back; - // SAFETY: see the comment for next() above - let slice = unsafe { self.haystack.get_unchecked(self.finger..old_finger) }; - let mut iter = slice.chars(); - let old_len = iter.iter.len(); - if let Some(ch) = iter.next_back() { - // subtract byte offset of current character - // without re-encoding as utf-8 - self.finger_back -= old_len - iter.iter.len(); - if ch == self.needle { - SearchStep::Match(self.finger_back, old_finger) - } else { - SearchStep::Reject(self.finger_back, old_finger) - } - } else { - SearchStep::Done - } + self.0.next_back() } #[inline] fn next_match_back(&mut self) -> Option<(usize, usize)> { - let haystack = self.haystack.as_bytes(); - loop { - // get the haystack up to but not including the last character searched - let bytes = haystack.get(self.finger..self.finger_back)?; - // the last byte of the utf8 encoded needle - // SAFETY: we have an invariant that `utf8_size < 5` - let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) }; - if let Some(index) = 
memchr::memrchr(last_byte, bytes) { - // we searched a slice that was offset by self.finger, - // add self.finger to recoup the original index - let index = self.finger + index; - // memrchr will return the index of the byte we wish to - // find. In case of an ASCII character, this is indeed - // were we wish our new finger to be ("after" the found - // char in the paradigm of reverse iteration). For - // multibyte chars we need to skip down by the number of more - // bytes they have than ASCII - let shift = self.utf8_size - 1; - if index >= shift { - let found_char = index - shift; - if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) { - if slice == &self.utf8_encoded[0..self.utf8_size] { - // move finger to before the character found (i.e., at its start index) - self.finger_back = found_char; - return Some((self.finger_back, self.finger_back + self.utf8_size)); - } - } - } - // We can't use finger_back = index - size + 1 here. If we found the last char - // of a different-sized character (or the middle byte of a different character) - // we need to bump the finger_back down to `index`. This similarly makes - // `finger_back` have the potential to no longer be on a boundary, - // but this is OK since we only exit this function on a boundary - // or when the haystack has been searched completely. - // - // Unlike next_match this does not - // have the problem of repeated bytes in utf-8 because - // we're searching for the last byte, and we can only have - // found the last byte when searching in reverse. 
- self.finger_back = index; - } else { - self.finger_back = self.finger; - // found nothing, exit - return None; - } - } + self.0.next_match_back() + } + #[inline] + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + self.0.next_reject_back() } - - // let next_reject_back use the default implementation from the Searcher trait } -impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} +impl<'a> DoubleEndedSearcher<&'a str> for CharSearcher<'a> {} /// Searches for chars that are equal to a given [`char`]. /// @@ -535,32 +141,19 @@ impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} /// /// ``` /// assert_eq!("Hello world".find('o'), Some(4)); +/// assert_eq!("Hello world".find('x'), None); /// ``` -impl<'a> Pattern<'a> for char { +impl<'a> Pattern<&'a str> for char { type Searcher = CharSearcher<'a>; #[inline] fn into_searcher(self, haystack: &'a str) -> Self::Searcher { - let mut utf8_encoded = [0; 4]; - let utf8_size = self.encode_utf8(&mut utf8_encoded).len(); - CharSearcher { - haystack, - finger: 0, - finger_back: haystack.len(), - needle: self, - utf8_size, - utf8_encoded, - } + CharSearcher::new(haystack, self) } #[inline] fn is_contained_in(self, haystack: &'a str) -> bool { - if (self as u32) < 128 { - haystack.as_bytes().contains(&(self as u8)) - } else { - let mut buffer = [0u8; 4]; - self.encode_utf8(&mut buffer).is_contained_in(haystack) - } + self.is_contained_in(str_bytes::Bytes::from(haystack)) } #[inline] @@ -570,23 +163,17 @@ impl<'a> Pattern<'a> for char { #[inline] fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> { - self.encode_utf8(&mut [0u8; 4]).strip_prefix_of(haystack) + self.strip_prefix_of(str_bytes::Bytes::from(haystack)).map(<&str>::from) } #[inline] - fn is_suffix_of(self, haystack: &'a str) -> bool - where - Self::Searcher: ReverseSearcher<'a>, - { - self.encode_utf8(&mut [0u8; 4]).is_suffix_of(haystack) + fn is_suffix_of(self, haystack: &'a str) -> bool { + 
self.is_suffix_of(str_bytes::Bytes::from(haystack)) } #[inline] - fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> - where - Self::Searcher: ReverseSearcher<'a>, - { - self.encode_utf8(&mut [0u8; 4]).strip_suffix_of(haystack) + fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> { + self.strip_suffix_of(str_bytes::Bytes::from(haystack)).map(<&str>::from) } } @@ -639,7 +226,7 @@ struct MultiCharEqSearcher<'a, C: MultiCharEq> { char_indices: super::CharIndices<'a>, } -impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern { +impl<'a, C: MultiCharEq> Pattern<&'a str> for MultiCharEqPattern { type Searcher = MultiCharEqSearcher<'a, C>; #[inline] @@ -648,7 +235,7 @@ impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern { } } -unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> { +unsafe impl<'a, C: MultiCharEq> Searcher<&'a str> for MultiCharEqSearcher<'a, C> { #[inline] fn haystack(&self) -> &'a str { self.haystack @@ -673,7 +260,7 @@ unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> { } } -unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, C> { +unsafe impl<'a, C: MultiCharEq> ReverseSearcher<&'a str> for MultiCharEqSearcher<'a, C> { #[inline] fn next_back(&mut self) -> SearchStep { let s = &mut self.char_indices; @@ -693,7 +280,7 @@ unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, } } -impl<'a, C: MultiCharEq> DoubleEndedSearcher<'a> for MultiCharEqSearcher<'a, C> {} +impl<'a, C: MultiCharEq> DoubleEndedSearcher<&'a str> for MultiCharEqSearcher<'a, C> {} ///////////////////////////////////////////////////////////////////////////// @@ -724,7 +311,7 @@ macro_rules! pattern_methods { #[inline] fn is_suffix_of(self, haystack: &'a str) -> bool where - $t: ReverseSearcher<'a>, + $t: ReverseSearcher<&'a str>, { ($pmap)(self).is_suffix_of(haystack) } @@ -732,7 +319,7 @@ macro_rules! 
pattern_methods { #[inline] fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> where - $t: ReverseSearcher<'a>, + $t: ReverseSearcher<&'a str>, { ($pmap)(self).strip_suffix_of(haystack) } @@ -774,16 +361,16 @@ macro_rules! searcher_methods { }; } -/// Associated type for `<[char; N] as Pattern<'a>>::Searcher`. +/// Associated type for `<[char; N] as Pattern<&'a str>>::Searcher`. #[derive(Clone, Debug)] pub struct CharArraySearcher<'a, const N: usize>( - as Pattern<'a>>::Searcher, + as Pattern<&'a str>>::Searcher, ); -/// Associated type for `<&[char; N] as Pattern<'a>>::Searcher`. +/// Associated type for `<&[char; N] as Pattern<&'a str>>::Searcher`. #[derive(Clone, Debug)] pub struct CharArrayRefSearcher<'a, 'b, const N: usize>( - as Pattern<'a>>::Searcher, + as Pattern<&'a str>>::Searcher, ); /// Searches for chars that are equal to any of the [`char`]s in the array. @@ -794,15 +381,15 @@ pub struct CharArrayRefSearcher<'a, 'b, const N: usize>( /// assert_eq!("Hello world".find(['l', 'l']), Some(2)); /// assert_eq!("Hello world".find(['l', 'l']), Some(2)); /// ``` -impl<'a, const N: usize> Pattern<'a> for [char; N] { +impl<'a, const N: usize> Pattern<&'a str> for [char; N] { pattern_methods!(CharArraySearcher<'a, N>, MultiCharEqPattern, CharArraySearcher); } -unsafe impl<'a, const N: usize> Searcher<'a> for CharArraySearcher<'a, N> { +unsafe impl<'a, const N: usize> Searcher<&'a str> for CharArraySearcher<'a, N> { searcher_methods!(forward); } -unsafe impl<'a, const N: usize> ReverseSearcher<'a> for CharArraySearcher<'a, N> { +unsafe impl<'a, const N: usize> ReverseSearcher<&'a str> for CharArraySearcher<'a, N> { searcher_methods!(reverse); } @@ -814,15 +401,15 @@ unsafe impl<'a, const N: usize> ReverseSearcher<'a> for CharArraySearcher<'a, N> /// assert_eq!("Hello world".find(&['l', 'l']), Some(2)); /// assert_eq!("Hello world".find(&['l', 'l']), Some(2)); /// ``` -impl<'a, 'b, const N: usize> Pattern<'a> for &'b [char; N] { +impl<'a, 'b, const N: 
usize> Pattern<&'a str> for &'b [char; N] { pattern_methods!(CharArrayRefSearcher<'a, 'b, N>, MultiCharEqPattern, CharArrayRefSearcher); } -unsafe impl<'a, 'b, const N: usize> Searcher<'a> for CharArrayRefSearcher<'a, 'b, N> { +unsafe impl<'a, 'b, const N: usize> Searcher<&'a str> for CharArrayRefSearcher<'a, 'b, N> { searcher_methods!(forward); } -unsafe impl<'a, 'b, const N: usize> ReverseSearcher<'a> for CharArrayRefSearcher<'a, 'b, N> { +unsafe impl<'a, 'b, const N: usize> ReverseSearcher<&'a str> for CharArrayRefSearcher<'a, 'b, N> { searcher_methods!(reverse); } @@ -832,19 +419,21 @@ unsafe impl<'a, 'b, const N: usize> ReverseSearcher<'a> for CharArrayRefSearcher // Todo: Change / Remove due to ambiguity in meaning. -/// Associated type for `<&[char] as Pattern<'a>>::Searcher`. +/// Associated type for `<&[char] as Pattern<&'a str>>::Searcher`. #[derive(Clone, Debug)] -pub struct CharSliceSearcher<'a, 'b>( as Pattern<'a>>::Searcher); +pub struct CharSliceSearcher<'a, 'b>( + as Pattern<&'a str>>::Searcher, +); -unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> { +unsafe impl<'a, 'b> Searcher<&'a str> for CharSliceSearcher<'a, 'b> { searcher_methods!(forward); } -unsafe impl<'a, 'b> ReverseSearcher<'a> for CharSliceSearcher<'a, 'b> { +unsafe impl<'a, 'b> ReverseSearcher<&'a str> for CharSliceSearcher<'a, 'b> { searcher_methods!(reverse); } -impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {} +impl<'a, 'b> DoubleEndedSearcher<&'a str> for CharSliceSearcher<'a, 'b> {} /// Searches for chars that are equal to any of the [`char`]s in the slice. 
/// @@ -854,7 +443,7 @@ impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {} /// assert_eq!("Hello world".find(&['l', 'l'] as &[_]), Some(2)); /// assert_eq!("Hello world".find(&['l', 'l'][..]), Some(2)); /// ``` -impl<'a, 'b> Pattern<'a> for &'b [char] { +impl<'a, 'b> Pattern<&'a str> for &'b [char] { pattern_methods!(CharSliceSearcher<'a, 'b>, MultiCharEqPattern, CharSliceSearcher); } @@ -862,9 +451,9 @@ impl<'a, 'b> Pattern<'a> for &'b [char] { // Impl for F: FnMut(char) -> bool ///////////////////////////////////////////////////////////////////////////// -/// Associated type for `<F as Pattern<'a>>::Searcher`. +/// Associated type for `<F as Pattern<&'a str>>::Searcher`. #[derive(Clone)] -pub struct CharPredicateSearcher<'a, F>(<MultiCharEqPattern<F> as Pattern<'a>>::Searcher) +pub struct CharPredicateSearcher<'a, F>(<MultiCharEqPattern<F> as Pattern<&'a str>>::Searcher) where F: FnMut(char) -> bool; @@ -879,21 +468,24 @@ where .finish() } } -unsafe impl<'a, F> Searcher<'a> for CharPredicateSearcher<'a, F> +unsafe impl<'a, F> Searcher<&'a str> for CharPredicateSearcher<'a, F> where F: FnMut(char) -> bool, { searcher_methods!(forward); } -unsafe impl<'a, F> ReverseSearcher<'a> for CharPredicateSearcher<'a, F> +unsafe impl<'a, F> ReverseSearcher<&'a str> for CharPredicateSearcher<'a, F> where F: FnMut(char) -> bool, { searcher_methods!(reverse); } -impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F> where F: FnMut(char) -> bool {} +impl<'a, F> DoubleEndedSearcher<&'a str> for CharPredicateSearcher<'a, F> where + F: FnMut(char) -> bool +{ +} /// Searches for [`char`]s that match the given predicate.
/// @@ -903,7 +495,7 @@ impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F> where F: Fn /// assert_eq!("Hello world".find(char::is_uppercase), Some(0)); /// assert_eq!("Hello world".find(|c| "aeiou".contains(c)), Some(1)); /// ``` -impl<'a, F> Pattern<'a> for F +impl<'a, F> Pattern<&'a str> for F where F: FnMut(char) -> bool, { @@ -915,7 +507,7 @@ where ///////////////////////////////////////////////////////////////////////////// /// Delegates to the `&str` impl. -impl<'a, 'b, 'c> Pattern<'a> for &'c &'b str { +impl<'a, 'b, 'c> Pattern<&'a str> for &'c &'b str { pattern_methods!(StrSearcher<'a, 'b>, |&s| s, |s| s); } @@ -933,7 +525,7 @@ impl<'a, 'b, 'c> Pattern<'a> for &'c &'b str { /// ``` /// assert_eq!("Hello world".find("world"), Some(6)); /// ``` -impl<'a, 'b> Pattern<'a> for &'b str { +impl<'a, 'b> Pattern<&'a str> for &'b str { type Searcher = StrSearcher<'a, 'b>; #[inline] @@ -1008,707 +600,50 @@ impl<'a, 'b> Pattern<'a> for &'b str { ///////////////////////////////////////////////////////////////////////////// #[derive(Clone, Debug)] -/// Associated type for `<&str as Pattern<'a>>::Searcher`. -pub struct StrSearcher<'a, 'b> { - haystack: &'a str, - needle: &'b str, - - searcher: StrSearcherImpl, -} - -#[derive(Clone, Debug)] -enum StrSearcherImpl { - Empty(EmptyNeedle), - TwoWay(TwoWaySearcher), -} - -#[derive(Clone, Debug)] -struct EmptyNeedle { - position: usize, - end: usize, - is_match_fw: bool, - is_match_bw: bool, - // Needed in case of an empty haystack, see #85462 - is_finished: bool, -} +/// Associated type for `<&str as Pattern<&'a str>>::Searcher`. 
+pub struct StrSearcher<'a, 'b>(crate::str_bytes::StrSearcher<'a, 'b, crate::str_bytes::Utf8>); impl<'a, 'b> StrSearcher<'a, 'b> { fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> { - if needle.is_empty() { - StrSearcher { - haystack, - needle, - searcher: StrSearcherImpl::Empty(EmptyNeedle { - position: 0, - end: haystack.len(), - is_match_fw: true, - is_match_bw: true, - is_finished: false, - }), - } - } else { - StrSearcher { - haystack, - needle, - searcher: StrSearcherImpl::TwoWay(TwoWaySearcher::new( - needle.as_bytes(), - haystack.len(), - )), - } - } + let haystack = crate::str_bytes::Bytes::from(haystack); + Self(crate::str_bytes::StrSearcher::new(haystack, needle)) } } -unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { +unsafe impl<'a, 'b> Searcher<&'a str> for StrSearcher<'a, 'b> { #[inline] fn haystack(&self) -> &'a str { - self.haystack + self.0.haystack().into() } #[inline] fn next(&mut self) -> SearchStep { - match self.searcher { - StrSearcherImpl::Empty(ref mut searcher) => { - if searcher.is_finished { - return SearchStep::Done; - } - // empty needle rejects every char and matches every empty string between them - let is_match = searcher.is_match_fw; - searcher.is_match_fw = !searcher.is_match_fw; - let pos = searcher.position; - match self.haystack[pos..].chars().next() { - _ if is_match => SearchStep::Match(pos, pos), - None => { - searcher.is_finished = true; - SearchStep::Done - } - Some(ch) => { - searcher.position += ch.len_utf8(); - SearchStep::Reject(pos, searcher.position) - } - } - } - StrSearcherImpl::TwoWay(ref mut searcher) => { - // TwoWaySearcher produces valid *Match* indices that split at char boundaries - // as long as it does correct matching and that haystack and needle are - // valid UTF-8 - // *Rejects* from the algorithm can fall on any indices, but we will walk them - // manually to the next character boundary, so that they are utf-8 safe. 
- if searcher.position == self.haystack.len() { - return SearchStep::Done; - } - let is_long = searcher.memory == usize::MAX; - match searcher.next::( - self.haystack.as_bytes(), - self.needle.as_bytes(), - is_long, - ) { - SearchStep::Reject(a, mut b) => { - // skip to next char boundary - while !self.haystack.is_char_boundary(b) { - b += 1; - } - searcher.position = cmp::max(b, searcher.position); - SearchStep::Reject(a, b) - } - otherwise => otherwise, - } - } - } + self.0.next() } #[inline] fn next_match(&mut self) -> Option<(usize, usize)> { - match self.searcher { - StrSearcherImpl::Empty(..) => loop { - match self.next() { - SearchStep::Match(a, b) => return Some((a, b)), - SearchStep::Done => return None, - SearchStep::Reject(..) => {} - } - }, - StrSearcherImpl::TwoWay(ref mut searcher) => { - let is_long = searcher.memory == usize::MAX; - // write out `true` and `false` cases to encourage the compiler - // to specialize the two cases separately. - if is_long { - searcher.next::( - self.haystack.as_bytes(), - self.needle.as_bytes(), - true, - ) - } else { - searcher.next::( - self.haystack.as_bytes(), - self.needle.as_bytes(), - false, - ) - } - } - } + self.0.next_match() } -} -unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { - #[inline] - fn next_back(&mut self) -> SearchStep { - match self.searcher { - StrSearcherImpl::Empty(ref mut searcher) => { - if searcher.is_finished { - return SearchStep::Done; - } - let is_match = searcher.is_match_bw; - searcher.is_match_bw = !searcher.is_match_bw; - let end = searcher.end; - match self.haystack[..end].chars().next_back() { - _ if is_match => SearchStep::Match(end, end), - None => { - searcher.is_finished = true; - SearchStep::Done - } - Some(ch) => { - searcher.end -= ch.len_utf8(); - SearchStep::Reject(searcher.end, end) - } - } - } - StrSearcherImpl::TwoWay(ref mut searcher) => { - if searcher.end == 0 { - return SearchStep::Done; - } - let is_long = searcher.memory == usize::MAX; - match 
searcher.next_back::( - self.haystack.as_bytes(), - self.needle.as_bytes(), - is_long, - ) { - SearchStep::Reject(mut a, b) => { - // skip to next char boundary - while !self.haystack.is_char_boundary(a) { - a -= 1; - } - searcher.end = cmp::min(a, searcher.end); - SearchStep::Reject(a, b) - } - otherwise => otherwise, - } - } - } - } - - #[inline] - fn next_match_back(&mut self) -> Option<(usize, usize)> { - match self.searcher { - StrSearcherImpl::Empty(..) => loop { - match self.next_back() { - SearchStep::Match(a, b) => return Some((a, b)), - SearchStep::Done => return None, - SearchStep::Reject(..) => {} - } - }, - StrSearcherImpl::TwoWay(ref mut searcher) => { - let is_long = searcher.memory == usize::MAX; - // write out `true` and `false`, like `next_match` - if is_long { - searcher.next_back::( - self.haystack.as_bytes(), - self.needle.as_bytes(), - true, - ) - } else { - searcher.next_back::( - self.haystack.as_bytes(), - self.needle.as_bytes(), - false, - ) - } - } - } + fn next_reject(&mut self) -> Option<(usize, usize)> { + self.0.next_reject() } } -/// The internal state of the two-way substring search algorithm. -#[derive(Clone, Debug)] -struct TwoWaySearcher { - // constants - /// critical factorization index - crit_pos: usize, - /// critical factorization index for reversed needle - crit_pos_back: usize, - period: usize, - /// `byteset` is an extension (not part of the two way algorithm); - /// it's a 64-bit "fingerprint" where each set bit `j` corresponds - /// to a (byte & 63) == j present in the needle. - byteset: u64, - - // variables - position: usize, - end: usize, - /// index into needle before which we have already matched - memory: usize, - /// index into needle after which we have already matched - memory_back: usize, -} - -/* - This is the Two-Way search algorithm, which was introduced in the paper: - Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675. - - Here's some background information. 
- - A *word* is a string of symbols. The *length* of a word should be a familiar - notion, and here we denote it for any word x by |x|. - (We also allow for the possibility of the *empty word*, a word of length zero). - - If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a - *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p]. - For example, both 1 and 2 are periods for the string "aa". As another example, - the only period of the string "abcd" is 4. - - We denote by period(x) the *smallest* period of x (provided that x is non-empty). - This is always well-defined since every non-empty word x has at least one period, - |x|. We sometimes call this *the period* of x. - - If u, v and x are words such that x = uv, where uv is the concatenation of u and - v, then we say that (u, v) is a *factorization* of x. - - Let (u, v) be a factorization for a word x. Then if w is a non-empty word such - that both of the following hold - - - either w is a suffix of u or u is a suffix of w - - either w is a prefix of v or v is a prefix of w - - then w is said to be a *repetition* for the factorization (u, v). - - Just to unpack this, there are four possibilities here. Let w = "abc". Then we - might have: - - - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde") - - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab") - - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi") - - u is a suffix of w and v is a prefix of w. ex: ("bc", "a") - - Note that the word vu is a repetition for any factorization (u,v) of x = uv, - so every factorization has at least one repetition. - - If x is a string and (u, v) is a factorization for x, then a *local period* for - (u, v) is an integer r such that there is some word w such that |w| = r and w is - a repetition for (u, v). - - We denote by local_period(u, v) the smallest local period of (u, v). We sometimes - call this *the local period* of (u, v). 
Provided that x = uv is non-empty, this - is well-defined (because each non-empty word has at least one factorization, as - noted above). - - It can be proven that the following is an equivalent definition of a local period - for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for - all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are - defined. (i.e., i > 0 and i + r < |x|). - - Using the above reformulation, it is easy to prove that - - 1 <= local_period(u, v) <= period(uv) - - A factorization (u, v) of x such that local_period(u,v) = period(x) is called a - *critical factorization*. - - The algorithm hinges on the following theorem, which is stated without proof: - - **Critical Factorization Theorem** Any word x has at least one critical - factorization (u, v) such that |u| < period(x). - - The purpose of maximal_suffix is to find such a critical factorization. - - If the period is short, compute another factorization x = u' v' to use - for reverse search, chosen instead so that |v'| < period(x). - -*/ -impl TwoWaySearcher { - fn new(needle: &[u8], end: usize) -> TwoWaySearcher { - let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false); - let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true); - - let (crit_pos, period) = if crit_pos_false > crit_pos_true { - (crit_pos_false, period_false) - } else { - (crit_pos_true, period_true) - }; - - // A particularly readable explanation of what's going on here can be found - // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically - // see the code for "Algorithm CP" on p. 323. - // - // What's going on is we have some critical factorization (u, v) of the - // needle, and we want to determine whether u is a suffix of - // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use - // "Algorithm CP2", which is optimized for when the period of the needle - // is large. 
- if needle[..crit_pos] == needle[period..period + crit_pos] { - // short period case -- the period is exact - // compute a separate critical factorization for the reversed needle - // x = u' v' where |v'| < period(x). - // - // This is sped up by the period being known already. - // Note that a case like x = "acba" may be factored exactly forwards - // (crit_pos = 1, period = 3) while being factored with approximate - // period in reverse (crit_pos = 2, period = 2). We use the given - // reverse factorization but keep the exact period. - let crit_pos_back = needle.len() - - cmp::max( - TwoWaySearcher::reverse_maximal_suffix(needle, period, false), - TwoWaySearcher::reverse_maximal_suffix(needle, period, true), - ); - - TwoWaySearcher { - crit_pos, - crit_pos_back, - period, - byteset: Self::byteset_create(&needle[..period]), - - position: 0, - end, - memory: 0, - memory_back: needle.len(), - } - } else { - // long period case -- we have an approximation to the actual period, - // and don't use memorization. - // - // Approximate the period by lower bound max(|u|, |v|) + 1. - // The critical factorization is efficient to use for both forward and - // reverse search. - - TwoWaySearcher { - crit_pos, - crit_pos_back: crit_pos, - period: cmp::max(crit_pos, needle.len() - crit_pos) + 1, - byteset: Self::byteset_create(needle), - - position: 0, - end, - memory: usize::MAX, // Dummy value to signify that the period is long - memory_back: usize::MAX, - } - } - } - +unsafe impl<'a, 'b> ReverseSearcher<&'a str> for StrSearcher<'a, 'b> { #[inline] - fn byteset_create(bytes: &[u8]) -> u64 { - bytes.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a) - } - - #[inline] - fn byteset_contains(&self, byte: u8) -> bool { - (self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0 - } - - // One of the main ideas of Two-Way is that we factorize the needle into - // two halves, (u, v), and begin trying to find v in the haystack by scanning - // left to right. 
If v matches, we try to match u by scanning right to left. - // How far we can jump when we encounter a mismatch is all based on the fact - // that (u, v) is a critical factorization for the needle. - #[inline] - fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> S::Output - where - S: TwoWayStrategy, - { - // `next()` uses `self.position` as its cursor - let old_pos = self.position; - let needle_last = needle.len() - 1; - 'search: loop { - // Check that we have room to search in - // position + needle_last can not overflow if we assume slices - // are bounded by isize's range. - let tail_byte = match haystack.get(self.position + needle_last) { - Some(&b) => b, - None => { - self.position = haystack.len(); - return S::rejecting(old_pos, self.position); - } - }; - - if S::use_early_reject() && old_pos != self.position { - return S::rejecting(old_pos, self.position); - } - - // Quickly skip by large portions unrelated to our substring - if !self.byteset_contains(tail_byte) { - self.position += needle.len(); - if !long_period { - self.memory = 0; - } - continue 'search; - } - - // See if the right part of the needle matches - let start = - if long_period { self.crit_pos } else { cmp::max(self.crit_pos, self.memory) }; - for i in start..needle.len() { - if needle[i] != haystack[self.position + i] { - self.position += i - self.crit_pos + 1; - if !long_period { - self.memory = 0; - } - continue 'search; - } - } - - // See if the left part of the needle matches - let start = if long_period { 0 } else { self.memory }; - for i in (start..self.crit_pos).rev() { - if needle[i] != haystack[self.position + i] { - self.position += self.period; - if !long_period { - self.memory = needle.len() - self.period; - } - continue 'search; - } - } - - // We have found a match! 
- let match_pos = self.position; - - // Note: add self.period instead of needle.len() to have overlapping matches - self.position += needle.len(); - if !long_period { - self.memory = 0; // set to needle.len() - self.period for overlapping matches - } - - return S::matching(match_pos, match_pos + needle.len()); - } - } - - // Follows the ideas in `next()`. - // - // The definitions are symmetrical, with period(x) = period(reverse(x)) - // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v) - // is a critical factorization, so is (reverse(v), reverse(u)). - // - // For the reverse case we have computed a critical factorization x = u' v' - // (field `crit_pos_back`). We need |u| < period(x) for the forward case and - // thus |v'| < period(x) for the reverse. - // - // To search in reverse through the haystack, we search forward through - // a reversed haystack with a reversed needle, matching first u' and then v'. - #[inline] - fn next_back(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> S::Output - where - S: TwoWayStrategy, - { - // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()` - // are independent. - let old_end = self.end; - 'search: loop { - // Check that we have room to search in - // end - needle.len() will wrap around when there is no more room, - // but due to slice length limits it can never wrap all the way back - // into the length of haystack. 
- let front_byte = match haystack.get(self.end.wrapping_sub(needle.len())) { - Some(&b) => b, - None => { - self.end = 0; - return S::rejecting(0, old_end); - } - }; - - if S::use_early_reject() && old_end != self.end { - return S::rejecting(self.end, old_end); - } - - // Quickly skip by large portions unrelated to our substring - if !self.byteset_contains(front_byte) { - self.end -= needle.len(); - if !long_period { - self.memory_back = needle.len(); - } - continue 'search; - } - - // See if the left part of the needle matches - let crit = if long_period { - self.crit_pos_back - } else { - cmp::min(self.crit_pos_back, self.memory_back) - }; - for i in (0..crit).rev() { - if needle[i] != haystack[self.end - needle.len() + i] { - self.end -= self.crit_pos_back - i; - if !long_period { - self.memory_back = needle.len(); - } - continue 'search; - } - } - - // See if the right part of the needle matches - let needle_end = if long_period { needle.len() } else { self.memory_back }; - for i in self.crit_pos_back..needle_end { - if needle[i] != haystack[self.end - needle.len() + i] { - self.end -= self.period; - if !long_period { - self.memory_back = self.period; - } - continue 'search; - } - } - - // We have found a match! - let match_pos = self.end - needle.len(); - // Note: sub self.period instead of needle.len() to have overlapping matches - self.end -= needle.len(); - if !long_period { - self.memory_back = needle.len(); - } - - return S::matching(match_pos, match_pos + needle.len()); - } - } - - // Compute the maximal suffix of `arr`. - // - // The maximal suffix is a possible critical factorization (u, v) of `arr`. - // - // Returns (`i`, `p`) where `i` is the starting index of v and `p` is the - // period of v. - // - // `order_greater` determines if lexical order is `<` or `>`. Both - // orders must be computed -- the ordering with the largest `i` gives - // a critical factorization. 
- // - // For long period cases, the resulting period is not exact (it is too short). - #[inline] - fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) { - let mut left = 0; // Corresponds to i in the paper - let mut right = 1; // Corresponds to j in the paper - let mut offset = 0; // Corresponds to k in the paper, but starting at 0 - // to match 0-based indexing. - let mut period = 1; // Corresponds to p in the paper - - while let Some(&a) = arr.get(right + offset) { - // `left` will be inbounds when `right` is. - let b = arr[left + offset]; - if (a < b && !order_greater) || (a > b && order_greater) { - // Suffix is smaller, period is entire prefix so far. - right += offset + 1; - offset = 0; - period = right - left; - } else if a == b { - // Advance through repetition of the current period. - if offset + 1 == period { - right += offset + 1; - offset = 0; - } else { - offset += 1; - } - } else { - // Suffix is larger, start over from current location. - left = right; - right += 1; - offset = 0; - period = 1; - } - } - (left, period) - } - - // Compute the maximal suffix of the reverse of `arr`. - // - // The maximal suffix is a possible critical factorization (u', v') of `arr`. - // - // Returns `i` where `i` is the starting index of v', from the back; - // returns immediately when a period of `known_period` is reached. - // - // `order_greater` determines if lexical order is `<` or `>`. Both - // orders must be computed -- the ordering with the largest `i` gives - // a critical factorization. - // - // For long period cases, the resulting period is not exact (it is too short). - fn reverse_maximal_suffix(arr: &[u8], known_period: usize, order_greater: bool) -> usize { - let mut left = 0; // Corresponds to i in the paper - let mut right = 1; // Corresponds to j in the paper - let mut offset = 0; // Corresponds to k in the paper, but starting at 0 - // to match 0-based indexing. 
- let mut period = 1; // Corresponds to p in the paper - let n = arr.len(); - - while right + offset < n { - let a = arr[n - (1 + right + offset)]; - let b = arr[n - (1 + left + offset)]; - if (a < b && !order_greater) || (a > b && order_greater) { - // Suffix is smaller, period is entire prefix so far. - right += offset + 1; - offset = 0; - period = right - left; - } else if a == b { - // Advance through repetition of the current period. - if offset + 1 == period { - right += offset + 1; - offset = 0; - } else { - offset += 1; - } - } else { - // Suffix is larger, start over from current location. - left = right; - right += 1; - offset = 0; - period = 1; - } - if period == known_period { - break; - } - } - debug_assert!(period <= known_period); - left + fn next_back(&mut self) -> SearchStep { + self.0.next_back() } -} - -// TwoWayStrategy allows the algorithm to either skip non-matches as quickly -// as possible, or to work in a mode where it emits Rejects relatively quickly. -trait TwoWayStrategy { - type Output; - fn use_early_reject() -> bool; - fn rejecting(a: usize, b: usize) -> Self::Output; - fn matching(a: usize, b: usize) -> Self::Output; -} - -/// Skip to match intervals as quickly as possible -enum MatchOnly {} - -impl TwoWayStrategy for MatchOnly { - type Output = Option<(usize, usize)>; #[inline] - fn use_early_reject() -> bool { - false - } - #[inline] - fn rejecting(_a: usize, _b: usize) -> Self::Output { - None - } - #[inline] - fn matching(a: usize, b: usize) -> Self::Output { - Some((a, b)) + fn next_match_back(&mut self) -> Option<(usize, usize)> { + self.0.next_match_back() } -} - -/// Emit Rejects regularly -enum RejectAndMatch {} - -impl TwoWayStrategy for RejectAndMatch { - type Output = SearchStep; - #[inline] - fn use_early_reject() -> bool { - true - } - #[inline] - fn rejecting(a: usize, b: usize) -> Self::Output { - SearchStep::Reject(a, b) - } - #[inline] - fn matching(a: usize, b: usize) -> Self::Output { - SearchStep::Match(a, b) + 
fn next_reject_back(&mut self) -> Option<(usize, usize)> { + self.0.next_reject_back() } } diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs index 2acef432f2063..c66a06837f39f 100644 --- a/library/core/src/str/validations.rs +++ b/library/core/src/str/validations.rs @@ -76,7 +76,7 @@ pub unsafe fn next_code_point<'a, I: Iterator>(bytes: &mut I) -> /// /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string #[inline] -pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option +pub(crate) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option where I: DoubleEndedIterator, { @@ -120,6 +120,87 @@ const fn contains_nonascii(x: usize) -> bool { (x & NONASCII_MASK) != 0 } +/// Reads the first code point out of a byte slice validating whether it’s +/// valid. +/// +/// This is different than [`next_code_point`] in that it doesn’t assume +/// argument is well-formed UTF-8-like string. Together with the character its +/// encoded length is returned. +/// +/// If front of the bytes slice doesn’t contain valid UTF-8 bytes sequence (that +/// includes a WTF-8 encoded surrogate) returns `None`. +/// +/// ``` +/// #![feature(str_internals)] +/// use core::str::try_next_code_point; +/// +/// assert_eq!(Some(('f', 1)), try_next_code_point(b"foo".as_ref())); +/// assert_eq!(Some(('Ż', 2)), try_next_code_point("Żółw".as_bytes())); +/// assert_eq!(None, try_next_code_point(b"\xffoo".as_ref())); +/// ``` +#[unstable(feature = "str_internals", issue = "none")] +#[inline] +pub const fn try_next_code_point(bytes: &[u8]) -> Option<(char, usize)> { + let first = match bytes.first() { + Some(&byte) => byte, + None => return None, + }; + let (value, length) = if first < 0x80 { + (first as u32, 1) + } else if let Ok((cp, len)) = try_finish_byte_sequence(first, bytes, 0) { + (cp, len) + } else { + return None; + }; + // SAFETY: We’ve just verified value is correct Unicode scalar value. 
+ // Either ASCII (first branch of the if-else-if-else) or non-ASCII Unicode + // character (second branch). + Some((unsafe { char::from_u32_unchecked(value) }, length)) +} + +/// Reads the last code point out of a byte slice validating whether it’s +/// valid. +/// +/// This is different than `next_code_point_reverse` in that it doesn’t assume +/// argument is well-formed UTF-8-like string. Together with the character its +/// encoded length is returned. +/// +/// If back of the bytes slice doesn’t contain valid UTF-8 bytes sequence (that +/// includes a WTF-8 encoded surrogate) returns `None`. +/// +/// ``` +/// #![feature(str_internals)] +/// use core::str::try_next_code_point_reverse; +/// +/// assert_eq!(Some(('o', 1)), try_next_code_point_reverse(b"foo".as_ref())); +/// assert_eq!(Some(('‽', 3)), try_next_code_point_reverse("Uh‽".as_bytes())); +/// assert_eq!(None, try_next_code_point_reverse(b"foo\xff".as_ref())); +/// ``` +#[unstable(feature = "str_internals", issue = "none")] +#[inline] +pub const fn try_next_code_point_reverse(bytes: &[u8]) -> Option<(char, usize)> { + let mut n = 1; + let limit = bytes.len(); + let limit = if limit < 4 { limit } else { 4 }; // not .min(4) because of const + while n <= limit && !bytes[bytes.len() - n].is_utf8_char_boundary() { + n += 1; + } + if n <= limit { + // It’s not clear to me why, but range indexing isn’t const here, + // i.e. `&bytes[bytes.len() - n..]` doesn’t compile. Because of that + // I’m resorting to unsafe block with from_raw_parts. + // SAFETY: n ≤ limit ≤ bytes.len() thus bytes.len() - n ≥ 0 and we + // have n remaining bytes. + let bytes = unsafe { crate::slice::from_raw_parts(bytes.as_ptr().add(bytes.len() - n), n) }; + if let Some((chr, len)) = try_next_code_point(bytes) { + if n == len { + return Some((chr, len)); + } + } + } + None +} + /// Walks through `v` checking that it's a valid UTF-8 sequence, /// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`. 
#[inline(always)] @@ -134,78 +215,13 @@ pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { let align = v.as_ptr().align_offset(usize_bytes); while index < len { - let old_offset = index; - macro_rules! err { - ($error_len: expr) => { - return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len }) - }; - } - - macro_rules! next { - () => {{ - index += 1; - // we needed data, but there was none: error! - if index >= len { - err!(None) - } - v[index] - }}; - } - + let valid_up_to = index; let first = v[index]; if first >= 128 { - let w = utf8_char_width(first); - // 2-byte encoding is for codepoints \u{0080} to \u{07ff} - // first C2 80 last DF BF - // 3-byte encoding is for codepoints \u{0800} to \u{ffff} - // first E0 A0 80 last EF BF BF - // excluding surrogates codepoints \u{d800} to \u{dfff} - // ED A0 80 to ED BF BF - // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff - // first F0 90 80 80 last F4 8F BF BF - // - // Use the UTF-8 syntax from the RFC - // - // https://tools.ietf.org/html/rfc3629 - // UTF8-1 = %x00-7F - // UTF8-2 = %xC2-DF UTF8-tail - // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / - // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) - // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / - // %xF4 %x80-8F 2( UTF8-tail ) - match w { - 2 => { - if next!() as i8 >= -64 { - err!(Some(1)) - } - } - 3 => { - match (first, next!()) { - (0xE0, 0xA0..=0xBF) - | (0xE1..=0xEC, 0x80..=0xBF) - | (0xED, 0x80..=0x9F) - | (0xEE..=0xEF, 0x80..=0xBF) => {} - _ => err!(Some(1)), - } - if next!() as i8 >= -64 { - err!(Some(2)) - } - } - 4 => { - match (first, next!()) { - (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {} - _ => err!(Some(1)), - } - if next!() as i8 >= -64 { - err!(Some(2)) - } - if next!() as i8 >= -64 { - err!(Some(3)) - } - } - _ => err!(Some(1)), + match try_finish_byte_sequence(first, v, index) { + Ok((_value, length)) => index += length, + 
Err(error_len) => return Err(Utf8Error { valid_up_to, error_len }), + } - index += 1; } else { // Ascii case, try to skip forward quickly. // When the pointer is aligned, read 2 words of data per iteration @@ -241,6 +257,93 @@ pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { Ok(()) } +/// Try to finish a UTF-8 byte sequence. +/// +/// Assumes that `bytes[index] == first` and that `first >= 128`, i.e. that +/// `index` points at the beginning of a non-ASCII UTF-8 sequence in `bytes`. +/// +/// If the byte sequence at the index is correct, returns decoded code point and +/// length of the sequence. If it was invalid returns number of invalid bytes +/// or None if read was cut short. +#[inline(always)] +#[rustc_const_unstable(feature = "str_internals", issue = "none")] +const fn try_finish_byte_sequence( + first: u8, + bytes: &[u8], + index: usize, +) -> Result<(u32, usize), Option> { + macro_rules! get { + (raw $offset:expr) => { + if index + $offset < bytes.len() { + bytes[index + $offset] + } else { + return Err(None) + } + }; + (cont $offset:expr) => {{ + let byte = get!(raw $offset); + if !utf8_is_cont_byte(byte) { + return Err(Some($offset as u8)) + } + byte + }} + } + + // 2-byte encoding is for codepoints \u{0080} to \u{07ff} + // first C2 80 last DF BF + // 3-byte encoding is for codepoints \u{0800} to \u{ffff} + // first E0 A0 80 last EF BF BF + // excluding surrogates codepoints \u{d800} to \u{dfff} + // ED A0 80 to ED BF BF + // 4-byte encoding is for codepoints \u{10000} to \u{10ffff} + // first F0 90 80 80 last F4 8F BF BF + // + // Use the UTF-8 syntax from the RFC + // + // https://tools.ietf.org/html/rfc3629 + // UTF8-1 = %x00-7F + // UTF8-2 = %xC2-DF UTF8-tail + // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / + // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) + // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / + // %xF4 %x80-8F 2( UTF8-tail ) + match utf8_char_width(first) { + 2 => { + let
second = get!(cont 1); + let value = utf8_first_byte(first, 3); + let value = utf8_acc_cont_byte(value, second); + Ok((value, 2)) + } + 3 => { + let second = get!(raw 1); + match (first, second) { + (0xE0, 0xA0..=0xBF) + | (0xE1..=0xEC, 0x80..=0xBF) + | (0xED, 0x80..=0x9F) + | (0xEE..=0xEF, 0x80..=0xBF) => {} + _ => return Err(Some(1)), + } + let value = utf8_first_byte(first, 3); + let value = utf8_acc_cont_byte(value, second); + let value = utf8_acc_cont_byte(value, get!(cont 2)); + Ok((value, 3)) + } + 4 => { + let second = get!(raw 1); + match (first, second) { + (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {} + _ => return Err(Some(1)), + } + let value = utf8_first_byte(first, 4); + let value = utf8_acc_cont_byte(value, second); + let value = utf8_acc_cont_byte(value, get!(cont 2)); + let value = utf8_acc_cont_byte(value, get!(cont 3)); + Ok((value, 4)) + } + _ => Err(Some(1)), + } +} + // https://tools.ietf.org/html/rfc3629 const UTF8_CHAR_WIDTH: &[u8; 256] = &[ // 1 2 3 4 5 6 7 8 9 A B C D E F diff --git a/library/core/src/str_bytes.rs b/library/core/src/str_bytes.rs new file mode 100644 index 0000000000000..bc781e345b5b1 --- /dev/null +++ b/library/core/src/str_bytes.rs @@ -0,0 +1,1719 @@ +//! Module provides pattern matching features for string-like bytes slice. +//! +//! A ‘string-like bytes slice’ means that types and functions here try to +//! interpret bytes slices as well-formed WTF-8 but don’t assume it is and treat +//! bytes in invalid portions of the slices as characters for the purpose of +//! deciding where character boundaries lie. This can be demonstrated by how +//! empty pattern is matched (since empty patterns match character boundaries): +//! +//! ``` +//! #![feature(pattern, pattern_internals, str_internals)] +//! use core::pattern::{Pattern, Searcher}; +//! use core::str_bytes::Bytes; +//! +//! let data = ["Żółw".as_bytes(), &b"\xff\xff\xff"[..], "🕴".as_bytes()].concat(); +//! 
let mut searcher = "".into_searcher(Bytes::from(data.as_slice())); +//! let next = move || searcher.next_match().map(|(x, _)| x); +//! let boundaries = core::iter::from_fn(next).collect::>(); +//! assert_eq!(&[0, 2, 4, 6, 7, 8, 9, 10, 14][..], &boundaries[..]); +//! ``` +#![unstable(feature = "str_internals", issue = "none")] + +use crate::cmp; +use crate::marker::PhantomData; +use crate::mem::take; +use crate::ops; +use crate::pattern; +use crate::pattern::{Haystack, MatchOnly, Pattern, RejectOnly, SearchStep}; +use crate::str::{ + next_code_point, next_code_point_reverse, try_next_code_point, try_next_code_point_reverse, + utf8_char_width, +}; + +type OptRange = Option<(usize, usize)>; +type Range = ops::Range; + +//////////////////////////////////////////////////////////////////////////////// +// Bytes wrapper +//////////////////////////////////////////////////////////////////////////////// + +/// A reference to a string-like bytes slice. +/// +/// ‘String-like’ refers to the fact that parts of the data are valid WTF-8 and +/// when we split the slice we don’t want to split well-formed WTF-8 bytes +/// sequences. This is in a sense a generalisation of a `&str` which allows +/// portions of the buffer to be ill-formed while preserving correctness of +/// existing well-formed parts. +/// +/// The `F` generic argument tags the slice with a [flavour][Flavour] which +/// specifies structure of the data. +#[derive(Copy, Clone, Debug)] +pub struct Bytes<'a, F>(&'a [u8], PhantomData); + +impl<'a, F: Flavour> Bytes<'a, F> { + /// Creates a new `Bytes` wrapper around bytes slice. + /// + /// # Safety + /// + /// Caller must guarantee that the bytes adhere to the requirements for the + /// flavour `F`. E.g. for [`Wtf8`] flavour, the bytes must be well-formed + /// WTF-8 encoded string. + /// + /// It may be more convenient to use `Bytes::From` implementations which are + /// provided for `&str`, `&OsStr` and `&[u8]`. 
+ pub unsafe fn new(bytes: &'a [u8]) -> Bytes<'a, F> { + Self(bytes, PhantomData) + } +} + +impl<'a, F: Flavour> Bytes<'a, F> { + pub fn as_bytes(self) -> &'a [u8] { + self.0 + } + + pub fn len(self) -> usize { + self.0.len() + } + + pub fn is_empty(self) -> bool { + self.0.is_empty() + } + + /// Adjusts range’s start position forward so it points at a potential valid + /// WTF-8 byte sequence. + /// + /// `range` represents a possibly invalid range within the bytes; + /// furthermore, `range.start` must be non-zero. This method returns a new + /// start index which is a valid split position. If `range` is already + /// valid, the method simply returns `range.start`. + /// + /// When dealing with ill-formed WTF-8 sequences, this is not guaranteed to + /// advance position byte at a time. If you need to be able to advance + /// position byte at a time use `advance_range_start` instead. + fn adjust_position_fwd(self, range: Range) -> usize { + F::adjust_position_fwd(self.as_bytes(), range) + } + + /// Adjusts position backward so that it points at the closest potential + /// valid WTF-8 sequence. + /// + /// `range` represents a possibly invalid range within the bytes, + /// furthermore `range.end` must be less than bytes’ length. This method + /// returns a new end index which is a valid split position. If `range` is + /// already valid, the method simply returns `range.end`. + /// + /// When dealing with ill-formed WTF-8 sequences, this is not guaranteed to + /// advance position byte at a time. If you need to be able to advance + /// position character at a time use `advance_range_end` instead. + fn adjust_position_bwd(self, range: Range) -> usize { + F::adjust_position_bwd(self.as_bytes(), range) + } + + /// Given a valid range update its start so it falls on the next character + /// boundary. + /// + /// `range` must be non-empty. If it starts with a valid WTF-8 sequence, + /// this method returns position past that sequence.
Otherwise, it returns + /// `range.start + 1`. In other words, well-formed WTF-8 bytes sequences are + /// skipped in one go while ill-formed sequences are skipped byte-by-byte. + fn advance_range_start(self, range: Range) -> usize { + range.start + F::advance_range_start(&self.as_bytes()[range]) + } + + /// Given a valid range update its end so it falls on the previous + /// character boundary. + /// + /// `range` must be non-empty. If it ends with a valid WTF-8 sequence, this + /// method returns position of the start of that sequence. Otherwise, it + /// returns `range.end - 1`. In other words, well-formed WTF-8 bytes + /// sequences are skipped in one go while ill-formed sequences are skipped + /// byte-by-byte. + fn advance_range_end(self, range: Range) -> usize { + range.start + F::advance_range_end(&self.as_bytes()[range]) + } + + /// Returns valid UTF-8 character at the front of the slice. + /// + /// If slice doesn’t start with a valid UTF-8 sequence, returns `None`. + /// Otherwise returns decoded character and its UTF-8 encoding’s length. + /// WTF-8 sequences which encode surrogates are considered invalid. + fn get_first_code_point(self) -> Option<(char, usize)> { + F::get_first_code_point(self.as_bytes()) + } + + /// Returns valid UTF-8 character at the end of the slice. + /// + /// If slice doesn’t end with a valid UTF-8 sequence, returns `None`. + /// Otherwise returns decoded character and its UTF-8 encoding’s length. + /// WTF-8 sequences which encode surrogates are considered invalid. + fn get_last_code_point(self) -> Option<(char, usize)> { + F::get_last_code_point(self.as_bytes()) + } + + /// Looks for the next UTF-8-encoded character in the slice. + /// + /// WTF-8 sequences which encode surrogates are considered invalid. + /// + /// Returns position of the match, decoded character and UTF-8 length of + /// that character.
+ fn find_code_point_fwd(self, range: Range) -> Option<(usize, char, usize)> { + F::find_code_point_fwd(&self.as_bytes()[range.clone()]) + .map(|(pos, chr, len)| (range.start + pos, chr, len)) + } + + /// Looks backwards for the next UTF-8 encoded character in the slice. + /// + /// WTF-8 sequences which encode surrogates are considered invalid. + /// + /// Returns position of the match, decoded character and UTF-8 length of + /// that character. + fn find_code_point_bwd(&self, range: Range) -> Option<(usize, char, usize)> { + F::find_code_point_bwd(&self.as_bytes()[range.clone()]) + .map(|(pos, chr, len)| (range.start + pos, chr, len)) + } +} + +impl<'a> From<&'a [u8]> for Bytes<'a, Unstructured> { + #[inline] + fn from(val: &'a [u8]) -> Self { + Self(val, PhantomData) + } +} + +impl<'a> From<&'a str> for Bytes<'a, Utf8> { + #[inline] + fn from(val: &'a str) -> Self { + // SAFETY: `str`’s bytes ares guaranteed to be UTF-8 so `Utf8` flavour + // is correct. + unsafe { Bytes::new(val.as_bytes()) } + } +} + +impl<'a> From> for &'a str { + #[inline] + fn from(bytes: Bytes<'a, Utf8>) -> &'a str { + if cfg!(debug_assertions) { + crate::str::from_utf8(bytes.as_bytes()).unwrap() + } else { + // SAFETY: Bytes has been created from &str and we’ve been + // maintaining UTF-8 format. + unsafe { crate::str::from_utf8_unchecked(bytes.as_bytes()) } + } + } +} + +#[derive(Clone, Copy, Debug)] +pub enum Unstructured {} +#[derive(Clone, Copy, Debug)] +pub enum Wtf8 {} +#[derive(Clone, Copy, Debug)] +pub enum Utf8 {} + +/// A marker trait indicating ‘flavour’ of data referred by [`Bytes`] type. +/// +/// The trait abstracts away operations related to identifying and decoding +/// ‘characters’ from a bytes slice. 
A valid WTF-8 byte sequence is always +/// treated as indivisible ‘character’ but depending on the flavour code can +/// make different assumption about contents of the bytes slice: +/// - [`Unstructured`] flavoured bytes slice may contain ill-formed bytes +/// sequences and in those each byte is treated as separate ‘character’, +/// - [`Wtf8`] flavoured bytes slice is a well-formed WTF-8-encoded string (that +/// is some of the byte sequences may encode surrogate code points) and +/// - [`Utf8`] flavoured bytes slice is a well-formed UTF-8-encoded string (that +/// is all byte sequences encode valid Unicode code points). +pub trait Flavour: private::Flavour {} + +impl Flavour for Unstructured {} +impl Flavour for Wtf8 {} +impl Flavour for Utf8 {} + +mod private { + use super::*; + + /// Private methods of the [`super::Flavour`] trait. + pub trait Flavour: Copy + core::fmt::Debug { + const IS_WTF8: bool; + fn adjust_position_fwd(bytes: &[u8], range: Range) -> usize; + fn adjust_position_bwd(bytes: &[u8], range: Range) -> usize; + fn advance_range_start(bytes: &[u8]) -> usize; + fn advance_range_end(bytes: &[u8]) -> usize; + fn get_first_code_point(bytes: &[u8]) -> Option<(char, usize)>; + fn get_last_code_point(bytes: &[u8]) -> Option<(char, usize)>; + fn find_code_point_fwd(bytes: &[u8]) -> Option<(usize, char, usize)>; + fn find_code_point_bwd(bytes: &[u8]) -> Option<(usize, char, usize)>; + } + + impl Flavour for super::Unstructured { + const IS_WTF8: bool = false; + + fn adjust_position_fwd(bytes: &[u8], range: Range) -> usize { + range.start + + bytes[range.clone()].iter().take_while(|chr| !chr.is_utf8_char_boundary()).count() + } + + fn adjust_position_bwd(bytes: &[u8], range: Range) -> usize { + range.end + - bytes[range.start..range.end + 1] + .iter() + .rev() + .take_while(|chr| !chr.is_utf8_char_boundary()) + .count() + } + + fn advance_range_start(bytes: &[u8]) -> usize { + assert!(!bytes.is_empty()); + try_next_code_point(bytes).map_or(1, |(_, len)| 
len) + } + + fn advance_range_end(bytes: &[u8]) -> usize { + assert!(!bytes.is_empty()); + bytes.len() - try_next_code_point_reverse(bytes).map_or(1, |(_, len)| len) + } + + fn get_first_code_point(bytes: &[u8]) -> Option<(char, usize)> { + try_next_code_point(bytes) + } + + fn get_last_code_point(bytes: &[u8]) -> Option<(char, usize)> { + try_next_code_point_reverse(bytes) + } + + fn find_code_point_fwd(bytes: &[u8]) -> Option<(usize, char, usize)> { + (0..bytes.len()) + .filter_map(|pos| { + let (chr, len) = try_next_code_point(&bytes[pos..])?; + Some((pos, chr, len)) + }) + .next() + } + + fn find_code_point_bwd(bytes: &[u8]) -> Option<(usize, char, usize)> { + (0..bytes.len()) + .rev() + .filter_map(|pos| { + let (chr, len) = try_next_code_point(&bytes[pos..])?; + Some((pos, chr, len)) + }) + .next() + } + } + + impl Flavour for Wtf8 { + const IS_WTF8: bool = true; + + fn adjust_position_fwd(bytes: &[u8], range: Range) -> usize { + let mut pos = range.start; + // Input is WTF-8 so we will never need to move more than three + // positions. This happens when we’re pointing at the first + // continuation byte of a four-byte sequence. Unroll the loop. + for _ in 0..3 { + // We’re not checking pos against _end because we know that _end + // == bytes.len() or falls on a character boundary. We can + // therefore compare against bytes.len() and eliminate that + // comparison. + if bytes.get(pos).map_or(true, |b: &u8| b.is_utf8_char_boundary()) { + break; + } + pos += 1; + } + pos + } + + fn adjust_position_bwd(bytes: &[u8], range: Range) -> usize { + let mut pos = range.end; + // Input is WTF-8 so we will never need to move more than three + // positions. This happens when we’re pointing at the last + // continuation byte of a four-byte sequence. Unroll the loop. + for _ in 0..3 { + // SAFETY: `bytes` is well-formed WTF-8 sequence and at function + // start `pos` is index within `bytes`.
Therefore, `bytes[pos]` + // is valid and a) if it’s a character boundary we exit the + // function or b) otherwise we know that `pos > 0` (because + // otherwise `bytes` wouldn’t be well-formed WTF-8). + if unsafe { bytes.get_unchecked(pos) }.is_utf8_char_boundary() { + break; + } + pos -= 1; + } + pos + } + + fn advance_range_start(bytes: &[u8]) -> usize { + // Input is valid WTF-8 so we can just deduce length of next + // sequence to skip from the first byte. + utf8_char_width(*bytes.get(0).unwrap()) + } + + fn advance_range_end(bytes: &[u8]) -> usize { + let end = bytes.len().checked_sub(1).unwrap(); + Self::adjust_position_bwd(bytes, 0..end) + } + + fn get_first_code_point(bytes: &[u8]) -> Option<(char, usize)> { + // SAFETY: We’re Wtf8 flavour. Client promises that bytes are + // well-formed WTF-8. + let cp = unsafe { next_code_point(&mut bytes.iter())? }; + // WTF-8 might produce surrogate code points so we still need to + // verify that we got a valid character. + char::from_u32(cp).map(|chr| (chr, len_utf8(cp))) + } + + fn get_last_code_point(bytes: &[u8]) -> Option<(char, usize)> { + // SAFETY: We’re Wtf8 flavour. Client promises that bytes are + // well-formed WTF-8. + let cp = unsafe { next_code_point_reverse(&mut bytes.iter().rev())? }; + // WTF-8 might produce surrogate code points so we still need to + // verify that we got a valid character. + char::from_u32(cp).map(|chr| (chr, len_utf8(cp))) + } + + fn find_code_point_fwd(bytes: &[u8]) -> Option<(usize, char, usize)> { + let mut iter = bytes.iter(); + let mut pos = 0; + loop { + // SAFETY: We’re Wtf8 flavour. Client promises that bytes are + // well-formed WTF-8. + let cp = unsafe { next_code_point(&mut iter)?
}; + let len = len_utf8(cp); + if let Some(chr) = char::from_u32(cp) { + return Some((pos, chr, len)); + } + pos += len; + } + } + + fn find_code_point_bwd(bytes: &[u8]) -> Option<(usize, char, usize)> { + let mut iter = bytes.iter().rev(); + let mut pos = bytes.len(); + loop { + // SAFETY: We’re Wtf8 flavour. Client promises that bytes are + // well-formed WTF-8. + let cp = unsafe { next_code_point_reverse(&mut iter)? }; + let len = len_utf8(cp); + pos -= len; + if let Some(chr) = char::from_u32(cp) { + return Some((pos, chr, len)); + } + } + } + } + + impl Flavour for Utf8 { + const IS_WTF8: bool = true; + + fn adjust_position_fwd(bytes: &[u8], range: Range) -> usize { + Wtf8::adjust_position_fwd(bytes, range) + } + + fn adjust_position_bwd(bytes: &[u8], range: Range) -> usize { + Wtf8::adjust_position_bwd(bytes, range) + } + + fn advance_range_start(bytes: &[u8]) -> usize { + Wtf8::advance_range_start(bytes) + } + + fn advance_range_end(bytes: &[u8]) -> usize { + Wtf8::advance_range_end(bytes) + } + + fn get_first_code_point(bytes: &[u8]) -> Option<(char, usize)> { + let (_, chr, len) = Self::find_code_point_fwd(bytes)?; + Some((chr, len)) + } + + fn get_last_code_point(bytes: &[u8]) -> Option<(char, usize)> { + let (_, chr, len) = Self::find_code_point_bwd(bytes)?; + Some((chr, len)) + } + + fn find_code_point_fwd(bytes: &[u8]) -> Option<(usize, char, usize)> { + // SAFETY: We’re Utf8 flavour. Client promises that bytes are + // well-formed UTF-8. We can not only assume well-formed byte + // sequence but also that produced code points are valid. + let chr = unsafe { char::from_u32_unchecked(next_code_point(&mut bytes.iter())?) }; + let len = chr.len_utf8(); + Some((0, chr, len)) + } + + fn find_code_point_bwd(bytes: &[u8]) -> Option<(usize, char, usize)> { + // SAFETY: We’re Utf8 flavour. Client promises that bytes are + // well-formed UTF-8. We can not only assume well-formed byte + // sequence but also that produced code points are valid. 
+ let chr = unsafe { + let code = next_code_point_reverse(&mut bytes.iter().rev())?; + char::from_u32_unchecked(code) + }; + let len = chr.len_utf8(); + Some((bytes.len() - len, chr, len)) + } + } + + // Copied from src/chars/methods.rs. We need it because it’s not public + // there and char::len_utf8 requires us to have a char and we need this to + // work on surrogate code points as well. + #[inline] + const fn len_utf8(code: u32) -> usize { + if code < 0x80 { + 1 + } else if code < 0x800 { + 2 + } else if code < 0x10000 { + 3 + } else { + 4 + } + } +} + +trait SearchResult: crate::pattern::SearchResult { + /// Adjusts reject’s start position backwards to make sure it doesn’t fall + /// withing well-formed WTF-8 sequence. + /// + /// Doesn’t move the start position past `begin`. If position was adjusted, + /// updates `*out` as well. + fn adjust_reject_start_bwd( + self, + bytes: Bytes<'_, F>, + begin: usize, + out: &mut usize, + ) -> Self; + + /// Adjusts reject’s end position forwards to make sure it doesn’t fall + /// withing well-formed WTF-8 sequence. + /// + /// Doesn’t move the end position past `len`. If position was adjusted, + /// updates `*out` as well. 
+ fn adjust_reject_end_fwd( + self, + bytes: Bytes<'_, F>, + len: usize, + out: &mut usize, + ) -> Self; +} + +impl SearchResult for SearchStep { + fn adjust_reject_start_bwd( + mut self, + bytes: Bytes<'_, F>, + begin: usize, + out: &mut usize, + ) -> Self { + if let SearchStep::Reject(ref mut start, _) = self { + *start = bytes.adjust_position_bwd(begin..*start); + *out = *start; + } + self + } + fn adjust_reject_end_fwd( + mut self, + bytes: Bytes<'_, F>, + len: usize, + out: &mut usize, + ) -> Self { + if let SearchStep::Reject(_, ref mut end) = self { + *end = bytes.adjust_position_fwd(*end..len); + *out = *end; + } + self + } +} + +impl SearchResult for MatchOnly { + fn adjust_reject_start_bwd( + self, + _bytes: Bytes<'_, F>, + _begin: usize, + _out: &mut usize, + ) -> Self { + self + } + fn adjust_reject_end_fwd( + self, + _bytes: Bytes<'_, F>, + _end: usize, + _out: &mut usize, + ) -> Self { + self + } +} + +impl SearchResult for RejectOnly { + fn adjust_reject_start_bwd( + mut self, + bytes: Bytes<'_, F>, + begin: usize, + out: &mut usize, + ) -> Self { + if let RejectOnly(Some((ref mut start, _))) = self { + *start = bytes.adjust_position_bwd(begin..*start); + *out = *start; + } + self + } + fn adjust_reject_end_fwd( + mut self, + bytes: Bytes<'_, F>, + len: usize, + out: &mut usize, + ) -> Self { + if let RejectOnly(Some((_, ref mut end))) = self { + *end = bytes.adjust_position_fwd(*end..len); + *out = *end; + } + self + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Impl for Haystack +//////////////////////////////////////////////////////////////////////////////// + +impl<'hs, F: Flavour> Haystack for Bytes<'hs, F> { + type Cursor = usize; + + fn cursor_at_front(self) -> Self::Cursor { + 0 + } + fn cursor_at_back(self) -> Self::Cursor { + self.0.len() + } + fn is_empty(self) -> bool { + self.0.is_empty() + } + + unsafe fn get_unchecked(self, range: Range) -> Self { + Self( + if cfg!(debug_assertions) { + 
self.0.get(range).unwrap() + } else { + // SAFETY: Caller promises cursor is a valid split position. + unsafe { self.0.get_unchecked(range) } + }, + PhantomData, + ) + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Impl Pattern for char +//////////////////////////////////////////////////////////////////////////////// + +impl<'hs, F: Flavour> Pattern> for char { + type Searcher = CharSearcher<'hs, F>; + + fn into_searcher(self, haystack: Bytes<'hs, F>) -> Self::Searcher { + Self::Searcher::new(haystack, self) + } + + fn is_contained_in(self, haystack: Bytes<'hs, F>) -> bool { + let mut buf = [0; 4]; + encode_utf8(self, &mut buf).is_contained_in(haystack) + } + + fn is_prefix_of(self, haystack: Bytes<'hs, F>) -> bool { + let mut buf = [0; 4]; + encode_utf8(self, &mut buf).is_prefix_of(haystack) + } + fn strip_prefix_of(self, haystack: Bytes<'hs, F>) -> Option> { + let mut buf = [0; 4]; + encode_utf8(self, &mut buf).strip_prefix_of(haystack) + } + + fn is_suffix_of(self, haystack: Bytes<'hs, F>) -> bool { + let mut buf = [0; 4]; + encode_utf8(self, &mut buf).is_suffix_of(haystack) + } + fn strip_suffix_of(self, haystack: Bytes<'hs, F>) -> Option> { + let mut buf = [0; 4]; + encode_utf8(self, &mut buf).strip_suffix_of(haystack) + } +} + +/// Like `chr.encode_utf8(&mut buf)` but casts result to `&str`. +/// +/// This is useful because we have Pattern impl for &str but not for &mut str. +fn encode_utf8(chr: char, buf: &mut [u8; 4]) -> &str { + chr.encode_utf8(buf) +} + +#[derive(Clone, Debug)] +pub struct CharSearcher<'hs, F> { + haystack: Bytes<'hs, F>, + state: CharSearcherState, +} + +#[derive(Clone, Debug)] +struct CharSearcherState { + /// Not yet processed range of the haystack. + range: crate::ops::Range, + /// Needle the searcher is looking for within the haystack. + needle: CharBuffer, + /// If `true` and `range` is non-empty, `haystack[range]` starts with the + /// needle. 
+ is_match_fwd: bool, + /// If `true` and `range` is non-empty, `haystack[range]` ends with the + /// needle. + is_match_bwd: bool, +} + +impl<'hs, F: Flavour> CharSearcher<'hs, F> { + #[inline] + pub fn new(haystack: Bytes<'hs, F>, chr: char) -> Self { + Self { haystack, state: CharSearcherState::new(haystack.len(), chr) } + } +} + +unsafe impl<'hs, F: Flavour> pattern::Searcher> for CharSearcher<'hs, F> { + fn haystack(&self) -> Bytes<'hs, F> { + self.haystack + } + + fn next(&mut self) -> SearchStep { + self.state.next_fwd(self.haystack) + } + fn next_match(&mut self) -> OptRange { + self.state.next_fwd::(self.haystack).0 + } + fn next_reject(&mut self) -> OptRange { + self.state.next_fwd::(self.haystack).0 + } +} + +unsafe impl<'hs, F: Flavour> pattern::ReverseSearcher> for CharSearcher<'hs, F> { + fn next_back(&mut self) -> SearchStep { + self.state.next_bwd(self.haystack) + } + fn next_match_back(&mut self) -> OptRange { + self.state.next_bwd::(self.haystack).0 + } + fn next_reject_back(&mut self) -> OptRange { + self.state.next_bwd::(self.haystack).0 + } +} + +impl<'hs, F: Flavour> pattern::DoubleEndedSearcher> for CharSearcher<'hs, F> {} + +impl CharSearcherState { + fn new(haystack_len: usize, chr: char) -> Self { + Self { + range: 0..haystack_len, + needle: CharBuffer::new(chr), + is_match_fwd: false, + is_match_bwd: false, + } + } + + fn find_match_fwd(&mut self, haystack: Bytes<'_, F>) -> OptRange { + let start = if take(&mut self.is_match_fwd) { + (!self.range.is_empty()).then_some(self.range.start) + } else { + // SAFETY: self.range is valid range of haystack. + let bytes = unsafe { haystack.get_unchecked(self.range.clone()) }; + // SAFETY: self.needle encodes a single character. 
+ unsafe { naive::find_match_fwd(bytes.as_bytes(), self.needle.as_str()) } + .map(|pos| pos + self.range.start) + }?; + Some((start, start + self.needle.len())) + } + + fn next_reject_fwd(&mut self, haystack: Bytes<'_, F>) -> OptRange { + if take(&mut self.is_match_fwd) { + if self.range.is_empty() { + return None; + } + self.range.start += self.needle.len() + } + // SAFETY: self.range is valid range of haystack. + let bytes = unsafe { haystack.get_unchecked(self.range.clone()) }; + if let Some(pos) = naive::find_reject_fwd(bytes.as_bytes(), self.needle.as_str()) { + let pos = pos + self.range.start; + let end = haystack.advance_range_start(pos..self.range.end); + self.range.start = end; + Some((pos, end)) + } else { + self.range.start = self.range.end; + None + } + } + + fn next_fwd(&mut self, haystack: Bytes<'_, F>) -> R { + if R::USE_EARLY_REJECT { + match self.next_reject_fwd(haystack) { + Some((start, end)) => R::rejecting(start, end).unwrap(), + None => R::DONE, + } + } else if let Some((start, end)) = self.find_match_fwd(haystack) { + if self.range.start < start { + if let Some(res) = R::rejecting(self.range.start, start) { + self.range.start = start; + self.is_match_fwd = true; + return res; + } + } + self.range.start = end; + R::matching(start, end).unwrap() + } else if self.range.is_empty() { + R::DONE + } else { + let start = self.range.start; + self.range.start = self.range.end; + R::rejecting(start, self.range.end).unwrap_or(R::DONE) + } + } + + fn find_match_bwd(&mut self, haystack: Bytes<'_, F>) -> OptRange { + let start = if take(&mut self.is_match_bwd) { + (!self.range.is_empty()).then(|| self.range.end - self.needle.len()) + } else { + // SAFETY: self.range is valid range of haystack. + let bytes = unsafe { haystack.get_unchecked(self.range.clone()) }; + // SAFETY: self.needle encodes a single character. 
+ unsafe { naive::find_match_bwd(bytes.as_bytes(), self.needle.as_str()) } + .map(|pos| pos + self.range.start) + }?; + Some((start, start + self.needle.len())) + } + + fn next_reject_bwd(&mut self, haystack: Bytes<'_, F>) -> OptRange { + if take(&mut self.is_match_bwd) { + if self.range.is_empty() { + return None; + } + self.range.end -= self.needle.len(); + } + // SAFETY: self.range is valid range of haystack. + let bytes = unsafe { haystack.get_unchecked(self.range.clone()) }; + if let Some(end) = naive::find_reject_bwd(bytes.as_bytes(), self.needle.as_str()) { + let end = end + self.range.start; + let start = haystack.advance_range_end(self.range.start..end); + self.range.end = start; + Some((start, end)) + } else { + self.range.end = self.range.start; + None + } + } + + fn next_bwd(&mut self, haystack: Bytes<'_, F>) -> R { + if R::USE_EARLY_REJECT { + match self.next_reject_bwd(haystack) { + Some((start, end)) => R::rejecting(start, end).unwrap(), + None => R::DONE, + } + } else if let Some((start, end)) = self.find_match_bwd(haystack) { + if end < self.range.end { + if let Some(res) = R::rejecting(end, self.range.end) { + self.range.end = end; + self.is_match_bwd = true; + return res; + } + } + self.range.end = start; + R::matching(start, end).unwrap() + } else if self.range.is_empty() { + R::DONE + } else { + let end = self.range.end; + self.range.end = self.range.start; + R::rejecting(self.range.start, end).unwrap_or(R::DONE) + } + } +} + +#[derive(Clone, Debug)] +struct CharBuffer([u8; 4], crate::num::NonZeroU8); + +impl CharBuffer { + fn new(chr: char) -> Self { + let mut buf = [0; 4]; + let len = chr.encode_utf8(&mut buf).len(); + // SAFETY: `len` is length of a single character UTF-8 sequence. 
+ let len = unsafe { crate::num::NonZeroU8::new_unchecked(len as u8) }; + Self(buf, len) + } + + fn len(&self) -> usize { + usize::from(self.1.get()) + } + + fn as_str(&self) -> &str { + // SAFETY: `self.0` is UTF-8 encoding of a single character and `self.1` + // is its length. See `new` constructor. + unsafe { crate::str::from_utf8_unchecked(self.0.get_unchecked(..self.len())) } + } +} + +mod naive { + use crate::slice::memchr; + + /// Looks forwards for the next position of needle within haystack. + /// + /// Safety: `needle` must consist of a single character. + pub(super) unsafe fn find_match_fwd(haystack: &[u8], needle: &str) -> Option { + debug_assert!(!needle.is_empty()); + // SAFETY: Caller promises needle is non-empty. + let (&last_byte, head) = unsafe { needle.as_bytes().split_last().unwrap_unchecked() }; + let mut start = 0; + while haystack.len() - start > head.len() { + // SAFETY: + // 1. `start` is initialised to `0` and only ever increased + // thus `0 ≤ start`. + // 2. We've checked `start + head.len() < haystack.len()`. + let bytes = unsafe { haystack.get_unchecked(start + head.len()..) }; + if let Some(index) = memchr::memchr(last_byte, bytes) { + // `start + index + head.len()` is the index of the last byte + // thus `start + index` is the index of the first byte. + let pos = start + index; + // SAFETY: Since we’ve started our search with head.len() + // offset, we know we have at least head.len() bytes in buffer. + if unsafe { haystack.get_unchecked(pos..pos + head.len()) } == head { + return Some(pos); + } + start += index + 1; + } else { + break; + } + } + None + } + + /// Looks backwards for the next position of needle within haystack. + /// + /// Safety: `needle` must consist of a single character. + pub(super) unsafe fn find_match_bwd(haystack: &[u8], needle: &str) -> Option { + // SAFETY: Caller promises needle is non-empty.
+ let (&first_byte, tail) = unsafe { needle.as_bytes().split_first().unwrap_unchecked() }; + let mut end = haystack.len(); + while end > tail.len() { + // SAFETY: + // 1. `end` is initialised to `haystack.len()` and only ever + // decreased thus `end ≤ haystack.len()`. + // 2. We've checked `end > tail.len()`. + let bytes = unsafe { haystack.get_unchecked(..end - tail.len()) }; + if let Some(pos) = memchr::memrchr(first_byte, bytes) { + // SAFETY: Since we’ve stopped our search with tail.len() + // offset, we know we have at least tail.len() bytes in buffer + // after position of the byte we’ve found. + if unsafe { haystack.get_unchecked(pos + 1..pos + 1 + tail.len()) } == tail { + return Some(pos); + } + end = pos; + } else { + break; + } + } + None + } + + /// Looks forwards for the next position where needle stops matching. + /// + /// Returns start of the next reject or `None` if there is no reject. + pub(super) fn find_reject_fwd(haystack: &[u8], needle: &str) -> Option { + let count = + haystack.chunks(needle.len()).take_while(|&slice| slice == needle.as_bytes()).count(); + let start = count * needle.len(); + (start < haystack.len()).then_some(start) + } + + /// Looks backwards for the next position where needle stops matching. + /// + /// Returns end of the next reject or `None` if there is no reject. 
+ pub(super) fn find_reject_bwd(haystack: &[u8], needle: &str) -> Option { + debug_assert!(!needle.is_empty()); + let count = + haystack.rchunks(needle.len()).take_while(|&slice| slice == needle.as_bytes()).count(); + let end = haystack.len() - count * needle.len(); + (end > 0).then_some(end) + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Impl Pattern for FnMut(char) and FnMut(Result) +//////////////////////////////////////////////////////////////////////////////// + +impl<'hs, F: Flavour, P: FnMut(char) -> bool> Pattern> for P { + type Searcher = PredicateSearcher<'hs, F, P>; + + fn into_searcher(self, haystack: Bytes<'hs, F>) -> Self::Searcher { + Self::Searcher::new(haystack, self) + } + + fn is_prefix_of(mut self, haystack: Bytes<'hs, F>) -> bool { + haystack.get_first_code_point().map_or(false, |(chr, _)| self(chr)) + } + fn strip_prefix_of(mut self, haystack: Bytes<'hs, F>) -> Option> { + let (chr, len) = haystack.get_first_code_point()?; + // SAFETY: We’ve just checked slice starts with len-byte long + // well-formed sequence. + self(chr).then(|| unsafe { haystack.get_unchecked(len..haystack.len()) }) + } + + fn is_suffix_of(mut self, haystack: Bytes<'hs, F>) -> bool { + haystack.get_last_code_point().map_or(false, |(chr, _)| self(chr)) + } + fn strip_suffix_of(mut self, haystack: Bytes<'hs, F>) -> Option> { + let (chr, len) = haystack.get_last_code_point()?; + let len = haystack.len() - len; + // SAFETY: We’ve just checked slice ends with len-byte long well-formed + // sequence. 
+ self(chr).then(|| unsafe { haystack.get_unchecked(0..len) }) + } +} + +#[derive(Clone, Debug)] +pub struct PredicateSearcher<'hs, F, P> { + haystack: Bytes<'hs, F>, + pred: P, + start: usize, + end: usize, + fwd_match_len: u8, + bwd_match_len: u8, +} + +impl<'hs, F: Flavour, P> PredicateSearcher<'hs, F, P> { + #[inline] + pub fn new(haystack: Bytes<'hs, F>, pred: P) -> Self { + Self { haystack, pred, start: 0, end: haystack.len(), fwd_match_len: 0, bwd_match_len: 0 } + } +} + +impl<'hs, F: Flavour, P: FnMut(char) -> bool> PredicateSearcher<'hs, F, P> { + fn find_match_fwd(&mut self) -> Option<(usize, usize)> { + let mut start = self.start; + while start < self.end { + let (idx, chr, len) = self.haystack.find_code_point_fwd(start..self.end)?; + if (self.pred)(chr) { + return Some((idx, len)); + } + start = idx + len; + } + None + } + + fn find_match_bwd(&mut self) -> Option<(usize, usize)> { + let mut end = self.end; + while self.start < end { + let (idx, chr, len) = self.haystack.find_code_point_bwd(self.start..end)?; + if (self.pred)(chr) { + return Some((idx, len)); + } + end = idx; + } + None + } + + fn next_fwd(&mut self) -> R { + while self.start < self.end { + if self.fwd_match_len == 0 { + let (pos, len) = self.find_match_fwd().unwrap_or((self.end, 0)); + self.fwd_match_len = len as u8; + if pos != self.start { + let start = self.start; + self.start = pos; + if let Some(ret) = R::rejecting(start, pos) { + return ret; + } else if pos >= self.end { + break; + } + } + } + + let pos = self.start; + self.start += usize::from(take(&mut self.fwd_match_len)); + if let Some(ret) = R::matching(pos, self.start) { + return ret; + } + } + R::DONE + } + + fn next_bwd(&mut self) -> R { + while self.start < self.end { + if self.bwd_match_len == 0 { + let (pos, len) = self.find_match_bwd().unwrap_or((self.start, 0)); + self.bwd_match_len = len as u8; + let pos = pos + len; + let end = self.end; + if pos != self.end { + self.end = pos; + if let Some(ret) = R::rejecting(pos, 
end) { + return ret; + } else if self.start >= self.end { + break; + } + } + } + + let end = self.end; + self.end -= usize::from(take(&mut self.bwd_match_len)); + if let Some(ret) = R::matching(self.end, end) { + return ret; + } + } + R::DONE + } +} + +unsafe impl<'hs, F, P> pattern::Searcher> for PredicateSearcher<'hs, F, P> +where + F: Flavour, + P: FnMut(char) -> bool, +{ + fn haystack(&self) -> Bytes<'hs, F> { + self.haystack + } + fn next(&mut self) -> SearchStep { + self.next_fwd() + } + fn next_match(&mut self) -> OptRange { + self.next_fwd::().0 + } + fn next_reject(&mut self) -> OptRange { + self.next_fwd::().0 + } +} + +unsafe impl<'hs, F, P> pattern::ReverseSearcher> for PredicateSearcher<'hs, F, P> +where + F: Flavour, + P: FnMut(char) -> bool, +{ + fn next_back(&mut self) -> SearchStep { + self.next_bwd() + } + fn next_match_back(&mut self) -> OptRange { + self.next_bwd::().0 + } + fn next_reject_back(&mut self) -> OptRange { + self.next_bwd::().0 + } +} + +impl<'hs, F, P> pattern::DoubleEndedSearcher> for PredicateSearcher<'hs, F, P> +where + F: Flavour, + P: FnMut(char) -> bool, +{ +} + +//////////////////////////////////////////////////////////////////////////////// +// Impl Pattern for &str +//////////////////////////////////////////////////////////////////////////////// + +impl<'hs, 'p, F: Flavour> Pattern> for &'p str { + type Searcher = StrSearcher<'hs, 'p, F>; + + fn into_searcher(self, haystack: Bytes<'hs, F>) -> Self::Searcher { + Self::Searcher::new(haystack, self) + } + + fn is_prefix_of(self, haystack: Bytes<'hs, F>) -> bool { + haystack.as_bytes().starts_with(self.as_bytes()) + } + fn strip_prefix_of(self, haystack: Bytes<'hs, F>) -> Option> { + haystack.as_bytes().strip_prefix(self.as_bytes()).map(|bytes| Bytes(bytes, PhantomData)) + } + + fn is_suffix_of(self, haystack: Bytes<'hs, F>) -> bool { + haystack.as_bytes().ends_with(self.as_bytes()) + } + fn strip_suffix_of(self, haystack: Bytes<'hs, F>) -> Option> { + 
haystack.as_bytes().strip_suffix(self.as_bytes()).map(|bytes| Bytes(bytes, PhantomData)) + } +} + +#[derive(Clone, Debug)] +pub struct StrSearcher<'hs, 'p, F> { + haystack: Bytes<'hs, F>, + inner: StrSearcherInner<'p>, +} + +impl<'hs, 'p, F: Flavour> StrSearcher<'hs, 'p, F> { + pub fn new(haystack: Bytes<'hs, F>, needle: &'p str) -> Self { + let inner = StrSearcherInner::new(haystack, needle); + Self { haystack, inner } + } +} + +unsafe impl<'hs, 'p, F: Flavour> pattern::Searcher> for StrSearcher<'hs, 'p, F> { + fn haystack(&self) -> Bytes<'hs, F> { + self.haystack + } + fn next(&mut self) -> SearchStep { + self.inner.next_fwd(self.haystack) + } + fn next_match(&mut self) -> OptRange { + self.inner.next_fwd::(self.haystack).0 + } + fn next_reject(&mut self) -> OptRange { + self.inner.next_fwd::(self.haystack).0 + } +} + +unsafe impl<'hs, 'p, F: Flavour> pattern::ReverseSearcher> + for StrSearcher<'hs, 'p, F> +{ + fn next_back(&mut self) -> SearchStep { + self.inner.next_bwd(self.haystack) + } + fn next_match_back(&mut self) -> OptRange { + self.inner.next_bwd::(self.haystack).0 + } + fn next_reject_back(&mut self) -> OptRange { + self.inner.next_bwd::(self.haystack).0 + } +} + +#[derive(Clone, Debug)] +enum StrSearcherInner<'p> { + Empty(EmptySearcherState), + Char(CharSearcherState), + Str(StrSearcherState<'p>), +} + +impl<'p> StrSearcherInner<'p> { + fn new(haystack: Bytes<'_, F>, needle: &'p str) -> Self { + let mut chars = needle.chars(); + let chr = match chars.next() { + Some(chr) => chr, + None => return Self::Empty(EmptySearcherState::new(haystack)), + }; + if chars.next().is_none() { + Self::Char(CharSearcherState::new(haystack.len(), chr)) + } else { + Self::Str(StrSearcherState::new(haystack, needle)) + } + } + + fn next_fwd(&mut self, haystack: Bytes<'_, F>) -> R { + match self { + Self::Empty(state) => state.next_fwd::(haystack), + Self::Char(state) => state.next_fwd::(haystack), + Self::Str(state) => state.next_fwd::(haystack), + } + } + + fn 
next_bwd(&mut self, haystack: Bytes<'_, F>) -> R { + match self { + Self::Empty(state) => state.next_bwd::(haystack), + Self::Char(state) => state.next_bwd::(haystack), + Self::Str(state) => state.next_bwd::(haystack), + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Empty needle searching +//////////////////////////////////////////////////////////////////////////////// + +/// Empty needle rejects every character and matches every character boundary. +/// +/// A character is either a well-formed WTF-8 bytes sequence or a single byte +/// whichever is longer. +#[derive(Clone, Debug)] +struct EmptySearcherState(pattern::EmptyNeedleSearcher); + +impl EmptySearcherState { + fn new(haystack: Bytes<'_, F>) -> Self { + Self(pattern::EmptyNeedleSearcher::new(haystack)) + } + + fn next_fwd(&mut self, bytes: Bytes<'_, F>) -> R { + self.0.next_fwd(|range| bytes.advance_range_start(range)) + } + + fn next_bwd(&mut self, bytes: Bytes<'_, F>) -> R { + self.0.next_bwd(|range| bytes.advance_range_end(range)) + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Full substring search +//////////////////////////////////////////////////////////////////////////////// + +/// A substring search. 
+#[derive(Clone, Debug)] +struct StrSearcherState<'p> { + needle: &'p str, + searcher: TwoWaySearcher, +} + +impl<'p> StrSearcherState<'p> { + fn new(haystack: Bytes<'_, F>, needle: &'p str) -> Self { + let searcher = TwoWaySearcher::new(haystack.len(), needle.as_bytes()); + Self { needle, searcher } + } + + fn next_fwd(&mut self, bytes: Bytes<'_, F>) -> R { + if self.searcher.position >= bytes.len() { + return R::DONE; + } + if self.searcher.memory == usize::MAX { + self.searcher.next_fwd::(bytes.0, self.needle.as_bytes(), true) + } else { + self.searcher.next_fwd::(bytes.0, self.needle.as_bytes(), false) + } + .adjust_reject_end_fwd(bytes, bytes.len(), &mut self.searcher.position) + } + + fn next_bwd(&mut self, bytes: Bytes<'_, F>) -> R { + if self.searcher.end == 0 { + return R::DONE; + } + if self.searcher.memory == usize::MAX { + self.searcher.next_bwd::(bytes.0, self.needle.as_bytes(), true) + } else { + self.searcher.next_bwd::(bytes.0, self.needle.as_bytes(), false) + } + .adjust_reject_start_bwd(bytes, 0, &mut self.searcher.end) + } +} + +/// The internal state of the two-way substring search algorithm. +#[derive(Clone, Debug)] +struct TwoWaySearcher { + // constants + /// critical factorization index + crit_pos: usize, + /// critical factorization index for reversed needle + crit_pos_back: usize, + period: usize, + /// `byteset` is an extension (not part of the two way algorithm); + /// it's a 64-bit "fingerprint" where each set bit `j` corresponds + /// to a (byte & 63) == j present in the needle. + byteset: u64, + + // variables + position: usize, + end: usize, + /// index into needle before which we have already matched + memory: usize, + /// index into needle after which we have already matched + memory_back: usize, +} + +/* + This is the Two-Way search algorithm, which was introduced in the paper: + Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675. + + Here's some background information. 
+ + A *word* is a string of symbols. The *length* of a word should be a familiar + notion, and here we denote it for any word x by |x|. + (We also allow for the possibility of the *empty word*, a word of length zero). + + If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a + *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p]. + For example, both 1 and 2 are periods for the string "aa". As another example, + the only period of the string "abcd" is 4. + + We denote by period(x) the *smallest* period of x (provided that x is non-empty). + This is always well-defined since every non-empty word x has at least one period, + |x|. We sometimes call this *the period* of x. + + If u, v and x are words such that x = uv, where uv is the concatenation of u and + v, then we say that (u, v) is a *factorization* of x. + + Let (u, v) be a factorization for a word x. Then if w is a non-empty word such + that both of the following hold + + - either w is a suffix of u or u is a suffix of w + - either w is a prefix of v or v is a prefix of w + + then w is said to be a *repetition* for the factorization (u, v). + + Just to unpack this, there are four possibilities here. Let w = "abc". Then we + might have: + + - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde") + - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab") + - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi") + - u is a suffix of w and v is a prefix of w. ex: ("bc", "a") + + Note that the word vu is a repetition for any factorization (u,v) of x = uv, + so every factorization has at least one repetition. + + If x is a string and (u, v) is a factorization for x, then a *local period* for + (u, v) is an integer r such that there is some word w such that |w| = r and w is + a repetition for (u, v). + + We denote by local_period(u, v) the smallest local period of (u, v). We sometimes + call this *the local period* of (u, v). 
Provided that x = uv is non-empty, this + is well-defined (because every factorization has at least one repetition, as + noted above). + + It can be proven that the following is an equivalent definition of a local period + for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for + all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are + defined (i.e., i >= 0 and i + r < |x|). + + Using the above reformulation, it is easy to prove that + + 1 <= local_period(u, v) <= period(uv) + + A factorization (u, v) of x such that local_period(u,v) = period(x) is called a + *critical factorization*. + + The algorithm hinges on the following theorem, which is stated without proof: + + **Critical Factorization Theorem** Any word x has at least one critical + factorization (u, v) such that |u| < period(x). + + The purpose of maximal_suffix is to find such a critical factorization. + + If the period is short, compute another factorization x = u' v' to use + for reverse search, chosen instead so that |v'| < period(x). + +*/ +impl TwoWaySearcher { + fn new(haystack_len: usize, needle: &[u8]) -> TwoWaySearcher { + let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false); + let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true); + + let (crit_pos, period) = if crit_pos_false > crit_pos_true { + (crit_pos_false, period_false) + } else { + (crit_pos_true, period_true) + }; + + // A particularly readable explanation of what's going on here can be found + // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically + // see the code for "Algorithm CP" on p. 323. + // + // What's going on is we have some critical factorization (u, v) of the + // needle, and we want to determine whether u is a suffix of + // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use + // "Algorithm CP2", which is optimized for when the period of the needle + // is large. 
+ if needle[..crit_pos] == needle[period..period + crit_pos] { + // short period case -- the period is exact + // compute a separate critical factorization for the reversed needle + // x = u' v' where |v'| < period(x). + // + // This is sped up by the period being known already. + // Note that a case like x = "acba" may be factored exactly forwards + // (crit_pos = 1, period = 3) while being factored with approximate + // period in reverse (crit_pos = 2, period = 2). We use the given + // reverse factorization but keep the exact period. + let crit_pos_back = needle.len() + - cmp::max( + TwoWaySearcher::reverse_maximal_suffix(needle, period, false), + TwoWaySearcher::reverse_maximal_suffix(needle, period, true), + ); + + TwoWaySearcher { + crit_pos, + crit_pos_back, + period, + byteset: Self::byteset_create(&needle[..period]), + + position: 0, + end: haystack_len, + memory: 0, + memory_back: needle.len(), + } + } else { + // long period case -- we have an approximation to the actual period, + // and don't use memorization. + // + // Approximate the period by lower bound max(|u|, |v|) + 1. + // The critical factorization is efficient to use for both forward and + // reverse search. + + TwoWaySearcher { + crit_pos, + crit_pos_back: crit_pos, + period: cmp::max(crit_pos, needle.len() - crit_pos) + 1, + byteset: Self::byteset_create(needle), + + position: 0, + end: haystack_len, + memory: usize::MAX, // Dummy value to signify that the period is long + memory_back: usize::MAX, + } + } + } + + #[inline] + fn byteset_create(bytes: &[u8]) -> u64 { + bytes.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a) + } + + #[inline] + fn byteset_contains(&self, byte: u8) -> bool { + (self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0 + } + + // One of the main ideas of Two-Way is that we factorize the needle into + // two halves, (u, v), and begin trying to find v in the haystack by scanning + // left to right. If v matches, we try to match u by scanning right to left. 
+ // How far we can jump when we encounter a mismatch is all based on the fact + // that (u, v) is a critical factorization for the needle. + #[inline] + fn next_fwd( + &mut self, + haystack: &[u8], + needle: &[u8], + long_period: bool, + ) -> R { + // `next()` uses `self.position` as its cursor + let old_pos = self.position; + let needle_last = needle.len() - 1; + 'search: loop { + // Check that we have room to search in + // position + needle_last can not overflow if we assume slices + // are bounded by isize's range. + let tail_byte = match haystack.get(self.position + needle_last) { + Some(&b) => b, + None => { + self.position = haystack.len(); + return R::rejecting(old_pos, self.position).unwrap_or(R::DONE); + } + }; + + if old_pos != self.position { + if let Some(ret) = R::rejecting(old_pos, self.position) { + return ret; + } + } + + // Quickly skip by large portions unrelated to our substring + if !self.byteset_contains(tail_byte) { + self.position += needle.len(); + if !long_period { + self.memory = 0; + } + continue 'search; + } + + // See if the right part of the needle matches + let start = + if long_period { self.crit_pos } else { cmp::max(self.crit_pos, self.memory) }; + for i in start..needle.len() { + if needle[i] != haystack[self.position + i] { + self.position += i - self.crit_pos + 1; + if !long_period { + self.memory = 0; + } + continue 'search; + } + } + + // See if the left part of the needle matches + let start = if long_period { 0 } else { self.memory }; + for i in (start..self.crit_pos).rev() { + if needle[i] != haystack[self.position + i] { + self.position += self.period; + if !long_period { + self.memory = needle.len() - self.period; + } + continue 'search; + } + } + + // We have found a match! 
+ let match_pos = self.position; + + // Note: add self.period instead of needle.len() to have overlapping matches + self.position += needle.len(); + if !long_period { + self.memory = 0; // set to needle.len() - self.period for overlapping matches + } + + if let Some(ret) = R::matching(match_pos, match_pos + needle.len()) { + return ret; + } + } + } + + // Follows the ideas in `next()`. + // + // The definitions are symmetrical, with period(x) = period(reverse(x)) + // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v) + // is a critical factorization, so is (reverse(v), reverse(u)). + // + // For the reverse case we have computed a critical factorization x = u' v' + // (field `crit_pos_back`). We need |u| < period(x) for the forward case and + // thus |v'| < period(x) for the reverse. + // + // To search in reverse through the haystack, we search forward through + // a reversed haystack with a reversed needle, matching first u' and then v'. + #[inline] + fn next_bwd( + &mut self, + haystack: &[u8], + needle: &[u8], + long_period: bool, + ) -> R { + // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()` + // are independent. + let old_end = self.end; + 'search: loop { + // Check that we have room to search in + // end - needle.len() will wrap around when there is no more room, + // but due to slice length limits it can never wrap all the way back + // into the length of haystack. 
+ let front_byte = match haystack.get(self.end.wrapping_sub(needle.len())) { + Some(&b) => b, + None => { + self.end = 0; + return R::rejecting(0, old_end).unwrap_or(R::DONE); + } + }; + + if old_end != self.end { + if let Some(ret) = R::rejecting(self.end, old_end) { + return ret; + } + } + + // Quickly skip by large portions unrelated to our substring + if !self.byteset_contains(front_byte) { + self.end -= needle.len(); + if !long_period { + self.memory_back = needle.len(); + } + continue 'search; + } + + // See if the left part of the needle matches + let crit = if long_period { + self.crit_pos_back + } else { + cmp::min(self.crit_pos_back, self.memory_back) + }; + for i in (0..crit).rev() { + if needle[i] != haystack[self.end - needle.len() + i] { + self.end -= self.crit_pos_back - i; + if !long_period { + self.memory_back = needle.len(); + } + continue 'search; + } + } + + // See if the right part of the needle matches + let needle_end = if long_period { needle.len() } else { self.memory_back }; + for i in self.crit_pos_back..needle_end { + if needle[i] != haystack[self.end - needle.len() + i] { + self.end -= self.period; + if !long_period { + self.memory_back = self.period; + } + continue 'search; + } + } + + // We have found a match! + let match_pos = self.end - needle.len(); + // Note: sub self.period instead of needle.len() to have overlapping matches + self.end -= needle.len(); + if !long_period { + self.memory_back = needle.len(); + } + + if let Some(ret) = R::matching(match_pos, match_pos + needle.len()) { + return ret; + } + } + } + + // Compute the maximal suffix of `arr`. + // + // The maximal suffix is a possible critical factorization (u, v) of `arr`. + // + // Returns (`i`, `p`) where `i` is the starting index of v and `p` is the + // period of v. + // + // `order_greater` determines if lexical order is `<` or `>`. Both + // orders must be computed -- the ordering with the largest `i` gives + // a critical factorization. 
+ // + // For long period cases, the resulting period is not exact (it is too short). + #[inline] + fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) { + let mut left = 0; // Corresponds to i in the paper + let mut right = 1; // Corresponds to j in the paper + let mut offset = 0; // Corresponds to k in the paper, but starting at 0 + // to match 0-based indexing. + let mut period = 1; // Corresponds to p in the paper + + while let Some(&a) = arr.get(right + offset) { + // `left` will be inbounds when `right` is. + let b = arr[left + offset]; + if (a < b && !order_greater) || (a > b && order_greater) { + // Suffix is smaller, period is entire prefix so far. + right += offset + 1; + offset = 0; + period = right - left; + } else if a == b { + // Advance through repetition of the current period. + if offset + 1 == period { + right += offset + 1; + offset = 0; + } else { + offset += 1; + } + } else { + // Suffix is larger, start over from current location. + left = right; + right += 1; + offset = 0; + period = 1; + } + } + (left, period) + } + + // Compute the maximal suffix of the reverse of `arr`. + // + // The maximal suffix is a possible critical factorization (u', v') of `arr`. + // + // Returns `i` where `i` is the starting index of v', from the back; + // returns immediately when a period of `known_period` is reached. + // + // `order_greater` determines if lexical order is `<` or `>`. Both + // orders must be computed -- the ordering with the largest `i` gives + // a critical factorization. + // + // For long period cases, the resulting period is not exact (it is too short). + fn reverse_maximal_suffix(arr: &[u8], known_period: usize, order_greater: bool) -> usize { + let mut left = 0; // Corresponds to i in the paper + let mut right = 1; // Corresponds to j in the paper + let mut offset = 0; // Corresponds to k in the paper, but starting at 0 + // to match 0-based indexing. 
+ let mut period = 1; // Corresponds to p in the paper + let n = arr.len(); + + while right + offset < n { + let a = arr[n - (1 + right + offset)]; + let b = arr[n - (1 + left + offset)]; + if (a < b && !order_greater) || (a > b && order_greater) { + // Suffix is smaller, period is entire prefix so far. + right += offset + 1; + offset = 0; + period = right - left; + } else if a == b { + // Advance through repetition of the current period. + if offset + 1 == period { + right += offset + 1; + offset = 0; + } else { + offset += 1; + } + } else { + // Suffix is larger, start over from current location. + left = right; + right += 1; + offset = 0; + period = 1; + } + if period == known_period { + break; + } + } + debug_assert!(period <= known_period); + left + } +} diff --git a/library/core/tests/pattern.rs b/library/core/tests/pattern.rs index d4bec996d89a1..e92c30bbdd735 100644 --- a/library/core/tests/pattern.rs +++ b/library/core/tests/pattern.rs @@ -1,12 +1,12 @@ -use std::str::pattern::*; +use std::pattern::*; // This macro makes it easier to write // tests that do a series of iterations macro_rules! 
search_asserts { - ($haystack:expr, $needle:expr, $testname:expr, [$($func:ident),*], $result:expr) => { + ($haystack:expr, $needle:expr, $testname:literal, $($func:ident => $result:expr),*) => { let mut searcher = $needle.into_searcher($haystack); - let arr = [$( Step::from(searcher.$func()) ),*]; - assert_eq!(&arr[..], &$result, $testname); + let arr = [$( searcher.$func().into_step(stringify!($func)) ),*]; + assert_eq!(&arr[..], &[$($result),*], $testname); } } @@ -17,26 +17,31 @@ enum Step { // be the same length for easy alignment Matches(usize, usize), Rejects(usize, usize), - InRange(usize, usize), Done, } -use self::Step::*; +use Step::*; -impl From for Step { - fn from(x: SearchStep) -> Self { - match x { - SearchStep::Match(a, b) => Matches(a, b), - SearchStep::Reject(a, b) => Rejects(a, b), +trait IntoStep { + fn into_step(self, method_name: &str) -> Step; +} + +impl IntoStep for SearchStep { + fn into_step(self, _name: &str) -> Step { + match self { + SearchStep::Match(s, e) => Matches(s, e), + SearchStep::Reject(s, e) => Rejects(s, e), SearchStep::Done => Done, } } } -impl From> for Step { - fn from(x: Option<(usize, usize)>) -> Self { - match x { - Some((a, b)) => InRange(a, b), +impl IntoStep for Option<(usize, usize)> { + fn into_step(self, method_name: &str) -> Step { + let is_reject = method_name.starts_with("next_reject"); + match self { + Some((s, e)) if is_reject => Rejects(s, e), + Some((s, e)) => Matches(s, e), None => Done, } } @@ -54,93 +59,53 @@ fn test_simple_iteration() { "abcdeabcd", 'a', "forward iteration for ASCII string", - // a b c d e a b c d EOF - [next, next, next, next, next, next, next, next, next, next], - [ - Matches(0, 1), - Rejects(1, 2), - Rejects(2, 3), - Rejects(3, 4), - Rejects(4, 5), - Matches(5, 6), - Rejects(6, 7), - Rejects(7, 8), - Rejects(8, 9), - Done - ] + next => Matches(0, 1), + next => Rejects(1, 5), + next => Matches(5, 6), + next => Rejects(6, 9), + next => Done ); search_asserts!( "abcdeabcd", 'a', 
"reverse iteration for ASCII string", - // d c b a e d c b a EOF - [ - next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, - next_back, next_back - ], - [ - Rejects(8, 9), - Rejects(7, 8), - Rejects(6, 7), - Matches(5, 6), - Rejects(4, 5), - Rejects(3, 4), - Rejects(2, 3), - Rejects(1, 2), - Matches(0, 1), - Done - ] + next_back => Rejects(6, 9), + next_back => Matches(5, 6), + next_back => Rejects(1, 5), + next_back => Matches(0, 1), + next_back => Done ); search_asserts!( "我爱我的猫", '我', "forward iteration for Chinese string", - // 我 愛 我 的 貓 EOF - [next, next, next, next, next, next], - [Matches(0, 3), Rejects(3, 6), Matches(6, 9), Rejects(9, 12), Rejects(12, 15), Done] + next => Matches(0, 3), + next => Rejects(3, 6), + next => Matches(6, 9), + next => Rejects(9, 15), + next => Done ); search_asserts!( "我的猫说meow", 'm', "forward iteration for mixed string", - // 我 的 猫 说 m e o w EOF - [next, next, next, next, next, next, next, next, next], - [ - Rejects(0, 3), - Rejects(3, 6), - Rejects(6, 9), - Rejects(9, 12), - Matches(12, 13), - Rejects(13, 14), - Rejects(14, 15), - Rejects(15, 16), - Done - ] + next => Rejects(0, 12), + next => Matches(12, 13), + next => Rejects(13, 16), + next => Done ); search_asserts!( "我的猫说meow", '猫', "reverse iteration for mixed string", - // w o e m 说 猫 的 我 EOF - [ - next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, - next_back - ], - [ - Rejects(15, 16), - Rejects(14, 15), - Rejects(13, 14), - Rejects(12, 13), - Rejects(9, 12), - Matches(6, 9), - Rejects(3, 6), - Rejects(0, 3), - Done - ] + next_back => Rejects(9, 16), + next_back => Matches(6, 9), + next_back => Rejects(0, 6), + next_back => Done ); } @@ -150,46 +115,43 @@ fn test_simple_search() { "abcdeabcdeabcde", 'a', "next_match for ASCII string", - [next_match, next_match, next_match, next_match], - [InRange(0, 1), InRange(5, 6), InRange(10, 11), Done] + next_match => Matches(0, 1), + next_match => Matches(5, 
6), + next_match => Matches(10, 11), + next_match => Done ); search_asserts!( "abcdeabcdeabcde", 'a', "next_match_back for ASCII string", - [next_match_back, next_match_back, next_match_back, next_match_back], - [InRange(10, 11), InRange(5, 6), InRange(0, 1), Done] + next_match_back => Matches(10, 11), + next_match_back => Matches(5, 6), + next_match_back => Matches(0, 1), + next_match_back => Done ); search_asserts!( "abcdeab", 'a', "next_reject for ASCII string", - [next_reject, next_reject, next_match, next_reject, next_reject], - [InRange(1, 2), InRange(2, 3), InRange(5, 6), InRange(6, 7), Done] + next_reject => Rejects(1, 2), + next_reject => Rejects(2, 3), + next_match => Matches(5, 6), + next_reject => Rejects(6, 7), + next_reject => Done ); search_asserts!( "abcdeabcdeabcde", 'a', "next_reject_back for ASCII string", - [ - next_reject_back, - next_reject_back, - next_match_back, - next_reject_back, - next_reject_back, - next_reject_back - ], - [ - InRange(14, 15), - InRange(13, 14), - InRange(10, 11), - InRange(9, 10), - InRange(8, 9), - InRange(7, 8) - ] + next_reject_back => Rejects(14, 15), + next_reject_back => Rejects(13, 14), + next_match_back => Matches(10, 11), + next_reject_back => Rejects(9, 10), + next_reject_back => Rejects(8, 9), + next_reject_back => Rejects(7, 8) ); } @@ -207,38 +169,31 @@ const STRESS: &str = "Áa🁀bÁꁁfg😁각กᘀ각aÁ각ꁁก😁a"; #[test] fn test_stress_indices() { // this isn't really a test, more of documentation on the indices of each character in the stresstest string - search_asserts!( STRESS, - 'x', + |_| true, "Indices of characters in stress test", - [ - next, next, next, next, next, next, next, next, next, next, next, next, next, next, - next, next, next, next, next, next, next - ], - [ - Rejects(0, 2), // Á - Rejects(2, 3), // a - Rejects(3, 7), // 🁀 - Rejects(7, 8), // b - Rejects(8, 10), // Á - Rejects(10, 13), // ꁁ - Rejects(13, 14), // f - Rejects(14, 15), // g - Rejects(15, 19), // 😀 - Rejects(19, 22), // 각 - Rejects(22, 
25), // ก - Rejects(25, 28), // ᘀ - Rejects(28, 31), // 각 - Rejects(31, 32), // a - Rejects(32, 34), // Á - Rejects(34, 37), // 각 - Rejects(37, 40), // ꁁ - Rejects(40, 43), // ก - Rejects(43, 47), // 😀 - Rejects(47, 48), // a - Done - ] + next => Matches(0, 2), // Á + next => Matches(2, 3), // a + next => Matches(3, 7), // 🁀 + next => Matches(7, 8), // b + next => Matches(8, 10), // Á + next => Matches(10, 13), // ꁁ + next => Matches(13, 14), // f + next => Matches(14, 15), // g + next => Matches(15, 19), // 😀 + next => Matches(19, 22), // 각 + next => Matches(22, 25), // ก + next => Matches(25, 28), // ᘀ + next => Matches(28, 31), // 각 + next => Matches(31, 32), // a + next => Matches(32, 34), // Á + next => Matches(34, 37), // 각 + next => Matches(37, 40), // ꁁ + next => Matches(40, 43), // ก + next => Matches(43, 47), // 😀 + next => Matches(47, 48), // a + next => Done ); } @@ -248,96 +203,113 @@ fn test_forward_search_shared_bytes() { STRESS, 'Á', "Forward search for two-byte Latin character", - [next_match, next_match, next_match, next_match], - [InRange(0, 2), InRange(8, 10), InRange(32, 34), Done] + next_match => Matches(0, 2), + next_match => Matches(8, 10), + next_match => Matches(32, 34), + next_match => Done ); search_asserts!( STRESS, 'Á', "Forward search for two-byte Latin character; check if next() still works", - [next_match, next, next_match, next, next_match, next, next_match], - [ - InRange(0, 2), - Rejects(2, 3), - InRange(8, 10), - Rejects(10, 13), - InRange(32, 34), - Rejects(34, 37), - Done - ] + next_match => Matches(0, 2), + next => Rejects(2, 8), + next_match => Matches(8, 10), + next => Rejects(10, 32), + next_match => Matches(32, 34), + next => Rejects(34, 48), + next_match => Done ); search_asserts!( STRESS, '각', "Forward search for three-byte Hangul character", - [next_match, next, next_match, next_match, next_match], - [InRange(19, 22), Rejects(22, 25), InRange(28, 31), InRange(34, 37), Done] + next_match => Matches(19, 22), + next => 
Rejects(22, 28), + next_match => Matches(28, 31), + next_match => Matches(34, 37), + next_match => Done ); search_asserts!( STRESS, '각', "Forward search for three-byte Hangul character; check if next() still works", - [next_match, next, next_match, next, next_match, next, next_match], - [ - InRange(19, 22), - Rejects(22, 25), - InRange(28, 31), - Rejects(31, 32), - InRange(34, 37), - Rejects(37, 40), - Done - ] + next_match => Matches(19, 22), + next => Rejects(22, 28), + next_match => Matches(28, 31), + next => Rejects(31, 34), + next_match => Matches(34, 37), + next => Rejects(37, 48), + next_match => Done ); search_asserts!( STRESS, 'ก', "Forward search for three-byte Thai character", - [next_match, next, next_match, next, next_match], - [InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done] + next_match => Matches(22, 25), + next => Rejects(25, 40), + next_match => Matches(40, 43), + next => Rejects(43, 48), + next_match => Done ); search_asserts!( STRESS, 'ก', "Forward search for three-byte Thai character; check if next() still works", - [next_match, next, next_match, next, next_match], - [InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done] + next_match => Matches(22, 25), + next => Rejects(25, 40), + next_match => Matches(40, 43), + next => Rejects(43, 48), + next_match => Done ); search_asserts!( STRESS, '😁', "Forward search for four-byte emoji", - [next_match, next, next_match, next, next_match], - [InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done] + next_match => Matches(15, 19), + next => Rejects(19, 43), + next_match => Matches(43, 47), + next => Rejects(47, 48), + next_match => Done ); search_asserts!( STRESS, '😁', "Forward search for four-byte emoji; check if next() still works", - [next_match, next, next_match, next, next_match], - [InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done] + next_match => Matches(15, 19), + next => Rejects(19, 43), + next_match => 
Matches(43, 47), + next => Rejects(47, 48), + next_match => Done ); search_asserts!( STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes", - [next_match, next, next_match, next, next_match], - [InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done] + next_match => Matches(10, 13), + next => Rejects(13, 37), + next_match => Matches(37, 40), + next => Rejects(40, 48), + next_match => Done ); search_asserts!( STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes; check if next() still works", - [next_match, next, next_match, next, next_match], - [InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done] + next_match => Matches(10, 13), + next => Rejects(13, 37), + next_match => Matches(37, 40), + next => Rejects(40, 48), + next_match => Done ); } @@ -347,96 +319,112 @@ fn test_reverse_search_shared_bytes() { STRESS, 'Á', "Reverse search for two-byte Latin character", - [next_match_back, next_match_back, next_match_back, next_match_back], - [InRange(32, 34), InRange(8, 10), InRange(0, 2), Done] + next_match_back => Matches(32, 34), + next_match_back => Matches(8, 10), + next_match_back => Matches(0, 2), + next_match_back => Done ); search_asserts!( STRESS, 'Á', "Reverse search for two-byte Latin character; check if next_back() still works", - [next_match_back, next_back, next_match_back, next_back, next_match_back, next_back], - [InRange(32, 34), Rejects(31, 32), InRange(8, 10), Rejects(7, 8), InRange(0, 2), Done] + next_match_back => Matches(32, 34), + next_back => Rejects(10, 32), + next_match_back => Matches(8, 10), + next_back => Rejects(2, 8), + next_match_back => Matches(0, 2), + next_back => Done ); search_asserts!( STRESS, '각', "Reverse search for three-byte Hangul character", - [next_match_back, next_back, next_match_back, next_match_back, next_match_back], - [InRange(34, 37), Rejects(32, 34), InRange(28, 31), InRange(19, 22), Done] + next_match_back => Matches(34, 37), + 
next_back => Rejects(31, 34), + next_match_back => Matches(28, 31), + next_match_back => Matches(19, 22), + next_match_back => Done ); search_asserts!( STRESS, '각', "Reverse search for three-byte Hangul character; check if next_back() still works", - [ - next_match_back, - next_back, - next_match_back, - next_back, - next_match_back, - next_back, - next_match_back - ], - [ - InRange(34, 37), - Rejects(32, 34), - InRange(28, 31), - Rejects(25, 28), - InRange(19, 22), - Rejects(15, 19), - Done - ] + next_match_back => Matches(34, 37), + next_back => Rejects(31, 34), + next_match_back => Matches(28, 31), + next_back => Rejects(22, 28), + next_match_back => Matches(19, 22), + next_back => Rejects(0, 19), + next_match_back => Done ); search_asserts!( STRESS, 'ก', "Reverse search for three-byte Thai character", - [next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done] + next_match_back => Matches(40, 43), + next_back => Rejects(25, 40), + next_match_back => Matches(22, 25), + next_back => Rejects(0, 22), + next_match_back => Done ); search_asserts!( STRESS, 'ก', "Reverse search for three-byte Thai character; check if next_back() still works", - [next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done] + next_match_back => Matches(40, 43), + next_back => Rejects(25, 40), + next_match_back => Matches(22, 25), + next_back => Rejects(0, 22), + next_match_back => Done ); search_asserts!( STRESS, '😁', "Reverse search for four-byte emoji", - [next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done] + next_match_back => Matches(43, 47), + next_back => Rejects(19, 43), + next_match_back => Matches(15, 19), + next_back => Rejects(0, 15), + next_match_back => Done ); search_asserts!( STRESS, '😁', 
"Reverse search for four-byte emoji; check if next_back() still works", - [next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done] + next_match_back => Matches(43, 47), + next_back => Rejects(19, 43), + next_match_back => Matches(15, 19), + next_back => Rejects(0, 15), + next_match_back => Done ); search_asserts!( STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes", - [next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(37, 40), Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done] + next_match_back => Matches(37, 40), + next_back => Rejects(13, 37), + next_match_back => Matches(10, 13), + next_back => Rejects(0, 10), + next_match_back => Done ); search_asserts!( STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes; check if next_back() still works", - [next_match_back, next_back, next_match_back, next_back, next_match_back], - [InRange(37, 40), Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done] + next_match_back => Matches(37, 40), + next_back => Rejects(13, 37), + next_match_back => Matches(10, 13), + next_back => Rejects(0, 10), + next_match_back => Done ); } @@ -448,56 +436,76 @@ fn double_ended_regression_test() { "abcdeabcdeabcde", 'a', "alternating double ended search", - [next_match, next_match_back, next_match, next_match_back], - [InRange(0, 1), InRange(10, 11), InRange(5, 6), Done] + next_match => Matches(0, 1), + next_match_back => Matches(10, 11), + next_match => Matches(5, 6), + next_match_back => Done ); search_asserts!( "abcdeabcdeabcde", 'a', "triple double ended search for a", - [next_match, next_match_back, next_match_back, next_match_back], - [InRange(0, 1), InRange(10, 11), InRange(5, 6), Done] + next_match => Matches(0, 1), + next_match_back => Matches(10, 11), + next_match_back => Matches(5, 6), + next_match_back => Done ); search_asserts!( "abcdeabcdeabcde", 
'd', "triple double ended search for d", - [next_match, next_match_back, next_match_back, next_match_back], - [InRange(3, 4), InRange(13, 14), InRange(8, 9), Done] + next_match => Matches(3, 4), + next_match_back => Matches(13, 14), + next_match_back => Matches(8, 9), + next_match_back => Done ); search_asserts!( STRESS, 'Á', "Double ended search for two-byte Latin character", - [next_match, next_match_back, next_match, next_match_back], - [InRange(0, 2), InRange(32, 34), InRange(8, 10), Done] + next_match => Matches(0, 2), + next_match_back => Matches(32, 34), + next_match => Matches(8, 10), + next_match_back => Done ); search_asserts!( STRESS, '각', "Reverse double ended search for three-byte Hangul character", - [next_match_back, next_back, next_match, next, next_match_back, next_match], - [InRange(34, 37), Rejects(32, 34), InRange(19, 22), Rejects(22, 25), InRange(28, 31), Done] + next_match_back => Matches(34, 37), + next_back => Rejects(31, 34), + next_match => Matches(19, 22), + next => Rejects(22, 28), + next_match_back => Matches(28, 31), + next_match => Done ); search_asserts!( STRESS, 'ก', "Double ended search for three-byte Thai character", - [next_match, next_back, next, next_match_back, next_match], - [InRange(22, 25), Rejects(47, 48), Rejects(25, 28), InRange(40, 43), Done] + next_match => Matches(22, 25), + next_back => Rejects(43, 48), + next => Rejects(25, 40), + next_match_back => Matches(40, 43), + next_match => Done ); search_asserts!( STRESS, '😁', "Double ended search for four-byte emoji", - [next_match_back, next, next_match, next_back, next_match], - [InRange(43, 47), Rejects(0, 2), InRange(15, 19), Rejects(40, 43), Done] + next_match_back => Matches(43, 47), + next => Rejects(0, 15), + next_match => Matches(15, 19), + next_back => Rejects(19, 43), + next_match => Done ); search_asserts!( STRESS, 'ꁁ', "Double ended search for three-byte Yi character with repeated bytes", - [next_match, next, next_match_back, next_back, next_match], - 
[InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(34, 37), Done] + next_match => Matches(10, 13), + next => Rejects(13, 37), + next_match_back => Matches(37, 40), + next_back => Done ); } diff --git a/library/std/src/ffi/os_str.rs b/library/std/src/ffi/os_str.rs index 5c0541d3caf33..74b95997af8c9 100644 --- a/library/std/src/ffi/os_str.rs +++ b/library/std/src/ffi/os_str.rs @@ -7,11 +7,12 @@ use crate::collections::TryReserveError; use crate::fmt; use crate::hash::{Hash, Hasher}; use crate::ops; +use crate::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, SearchStep, Searcher}; use crate::rc::Rc; use crate::str::FromStr; use crate::sync::Arc; -use crate::sys::os_str::{Buf, Slice}; +use crate::sys::os_str::{Buf, BytesFlavour, Slice}; use crate::sys_common::{AsInner, FromInner, IntoInner}; /// A type that can represent owned, mutable platform-native strings, but is @@ -978,6 +979,167 @@ impl OsStr { pub fn eq_ignore_ascii_case>(&self, other: S) -> bool { self.inner.eq_ignore_ascii_case(&other.as_ref().inner) } + + /// Returns `true` if the given pattern matches a prefix of this `OsStr` + /// slice. + /// + /// # Examples + /// + /// ``` + /// # #![feature(pattern)] + /// use std::ffi::OsStr; + /// + /// assert!(OsStr::new("foo").starts_with('f')); + /// assert!(!OsStr::new("foo").starts_with('F')); + /// ``` + #[unstable(feature = "pattern", issue = "27721")] + pub fn starts_with<'a, P: Pattern<&'a OsStr>>(&'a self, pat: P) -> bool { + pat.is_prefix_of(self) + } + + /// Returns `true` if the given pattern matches a suffix of this `OsStr` + /// slice. 
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # #![feature(pattern)]
+    /// use std::ffi::OsStr;
+    ///
+    /// assert!(OsStr::new("foo").ends_with('o'));
+    /// assert!(!OsStr::new("foo").ends_with('O'));
+    /// ```
+    #[unstable(feature = "pattern", issue = "27721")]
+    pub fn ends_with<'a, P>(&'a self, pat: P) -> bool
+    where
+        P: Pattern<&'a OsStr, Searcher: ReverseSearcher<&'a OsStr>>,
+    {
+        pat.is_suffix_of(self)
+    }
+
+    /// Returns an `OsStr` slice with the prefix removed.
+    ///
+    /// If the string starts with the pattern `prefix`, returns substring after
+    /// the prefix, wrapped in `Some`. If the string doesn’t start with
+    /// `prefix`, returns `None`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # #![feature(pattern)]
+    /// use std::ffi::OsStr;
+    ///
+    /// assert_eq!(Some(OsStr::new("foo")), OsStr::new("--foo").strip_prefix("--"));
+    /// assert_eq!(None, OsStr::new("-f").strip_prefix("--"));
+    /// ```
+    #[must_use = "this returns the remaining substring as a new slice, \
+                  without modifying the original"]
+    #[unstable(feature = "pattern", issue = "27721")]
+    pub fn strip_prefix<'a, P>(&'a self, prefix: P) -> Option<&'a OsStr>
+    where
+        P: Pattern<&'a OsStr>,
+    {
+        prefix.strip_prefix_of(self)
+    }
+
+    /// Returns an `OsStr` slice with the suffix removed.
+    ///
+    /// If the string ends with the pattern `suffix`, returns substring before
+    /// the suffix, wrapped in `Some`. If the string doesn’t end with `suffix`,
+    /// returns `None`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # #![feature(pattern)]
+    /// use std::ffi::OsStr;
+    ///
+    /// assert_eq!(Some(OsStr::new("fo")), OsStr::new("foo").strip_suffix('o'));
+    /// assert_eq!(None, OsStr::new("foo").strip_suffix('O'));
+    /// ```
+    #[must_use = "this returns the remaining substring as a new slice, \
+                  without modifying the original"]
+    #[unstable(feature = "pattern", issue = "27721")]
+    pub fn strip_suffix<'a, P>(&'a self, suffix: P) -> Option<&'a OsStr>
+    where
+        P: Pattern<&'a OsStr>,
+        <P as Pattern<&'a OsStr>>::Searcher: ReverseSearcher<&'a OsStr>,
+    {
+        suffix.strip_suffix_of(self)
+    }
+
+    /// Splits the string on the first occurrence of the specified delimiter and
+    /// returns prefix before delimiter and suffix after delimiter.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # #![feature(pattern)]
+    /// use std::ffi::OsStr;
+    ///
+    /// assert_eq!(Some((OsStr::new("foo"), OsStr::new("bar=baz"))),
+    ///            OsStr::new("foo=bar=baz").split_once('='));
+    /// assert_eq!(None, OsStr::new("foobar").split_once(','));
+    /// ```
+    #[unstable(feature = "pattern", issue = "27721")]
+    pub fn split_once<'a, P: Pattern<&'a OsStr>>(&'a self, delimiter: P) -> Option<(&Self, &Self)> {
+        let (start, end) = delimiter.into_searcher(self).next_match()?;
+        // SAFETY: assumes the searcher yields in-bounds match indices for `self` — TODO confirm.
+        Some(unsafe {
+            let head = self.bytes().get_unchecked(..start);
+            let tail = self.bytes().get_unchecked(end..);
+            let head = OsStr::from_inner(core::mem::transmute(head));
+            let tail = OsStr::from_inner(core::mem::transmute(tail));
+            (head, tail)
+        })
+    }
+
+    /// Splits the string on the last occurrence of the specified delimiter and
+    /// returns prefix before delimiter and suffix after delimiter.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # #![feature(pattern)]
+    /// use std::ffi::OsStr;
+    ///
+    /// assert_eq!(Some((OsStr::new("foo=bar"), OsStr::new("baz"))),
+    ///            OsStr::new("foo=bar=baz").rsplit_once('='));
+    /// assert_eq!(None, OsStr::new("foobar").rsplit_once(','));
+    /// ```
+    #[unstable(feature = "pattern", issue = "27721")]
+    pub fn rsplit_once<'a, P>(&'a self, delimiter: P) -> Option<(&Self, &Self)>
+    where
+        P: Pattern<&'a OsStr, Searcher: ReverseSearcher<&'a OsStr>>,
+    {
+        let (start, end) = delimiter.into_searcher(self).next_match_back()?;
+        Some(unsafe {
+            let head = self.bytes().get_unchecked(..start);
+            let tail = self.bytes().get_unchecked(end..);
+            let head = OsStr::from_inner(core::mem::transmute(head));
+            let tail = OsStr::from_inner(core::mem::transmute(tail));
+            (head, tail)
+        })
+    }
+
+    /// An iterator over substrings of this string slice, separated by
+    /// characters matched by a pattern.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # #![feature(pattern)]
+    /// use std::ffi::OsStr;
+    ///
+    /// let value = OsStr::new("foo,bar,baz");
+    /// let got = value.split(',').collect::<Vec<_>>();
+    /// let want = [OsStr::new("foo"), OsStr::new("bar"), OsStr::new("baz")];
+    /// assert_eq!(&want[..], &got[..]);
+    /// ```
+    #[unstable(feature = "pattern", issue = "27721")]
+    pub fn split<'hs, P: Pattern<&'hs OsStr>>(&'hs self, delimiter: P) -> Split<'hs, P::Searcher> {
+        Split(core::pattern::Split::new(delimiter.into_searcher(self)).with_allow_trailing_empty())
+    }
 }
 
 #[stable(feature = "box_from_os_str", since = "1.17.0")]
@@ -1445,3 +1607,344 @@ impl<'a> FromIterator> for OsString {
         }
     }
 }
+
+#[unstable(feature = "str_internals", issue = "none")]
+impl<'a> From<&'a OsStr> for core::str_bytes::Bytes<'a, BytesFlavour> {
+    fn from(val: &'a OsStr) -> Self {
+        val.inner.into()
+    }
+}
+
+#[unstable(feature = "str_internals", issue = "none")]
+impl<'a> From<core::str_bytes::Bytes<'a, BytesFlavour>> for &'a OsStr {
+    fn from(val: core::str_bytes::Bytes<'a, BytesFlavour>) -> Self {
+ OsStr::from_inner(<&Slice>::from(val)) + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs> core::pattern::Haystack for &'hs OsStr { + type Cursor = usize; + + #[inline(always)] + fn cursor_at_front(self) -> usize { + 0 + } + #[inline(always)] + fn cursor_at_back(self) -> usize { + self.inner.inner.len() + } + + #[inline(always)] + fn is_empty(self) -> bool { + self.inner.inner.is_empty() + } + + #[inline(always)] + unsafe fn get_unchecked(self, range: core::ops::Range) -> Self { + // SAFETY: Caller promises that `range` is valid. + OsStr::from_inner(unsafe { self.inner.get_unchecked(range) }) + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs> core::pattern::Pattern<&'hs OsStr> for char { + type Searcher = CharSearcher<'hs>; + + fn into_searcher(self, haystack: &'hs OsStr) -> Self::Searcher { + Self::Searcher::new(haystack, self) + } + + fn is_contained_in(self, haystack: &'hs OsStr) -> bool { + self.is_contained_in(core::str_bytes::Bytes::from(haystack)) + } + + fn is_prefix_of(self, haystack: &'hs OsStr) -> bool { + self.is_prefix_of(core::str_bytes::Bytes::from(haystack)) + } + + fn is_suffix_of(self, haystack: &'hs OsStr) -> bool { + self.is_suffix_of(core::str_bytes::Bytes::from(haystack)) + } + + /// Removes the pattern from the front of haystack, if it matches. + fn strip_prefix_of(self, haystack: &'hs OsStr) -> Option<&'hs OsStr> { + self.strip_prefix_of(core::str_bytes::Bytes::from(haystack)).map(|bytes| bytes.into()) + } + + /// Removes the pattern from the back of haystack, if it matches. 
+ fn strip_suffix_of(self, haystack: &'hs OsStr) -> Option<&'hs OsStr> + where + Self::Searcher: ReverseSearcher<&'hs OsStr>, + { + self.strip_suffix_of(core::str_bytes::Bytes::from(haystack)).map(|bytes| bytes.into()) + } +} + +#[derive(Clone, Debug)] +#[unstable(feature = "pattern", issue = "27721")] +pub struct CharSearcher<'hs>(core::str_bytes::CharSearcher<'hs, BytesFlavour>); + +impl<'hs> CharSearcher<'hs> { + fn new(haystack: &'hs OsStr, needle: char) -> CharSearcher<'hs> { + Self(core::str_bytes::CharSearcher::new(haystack.into(), needle)) + } +} + +#[unstable(feature = "pattern", issue = "27721")] +unsafe impl<'hs> Searcher<&'hs OsStr> for CharSearcher<'hs> { + #[inline(always)] + fn haystack(&self) -> &'hs OsStr { + self.0.haystack().into() + } + + #[inline(always)] + fn next(&mut self) -> SearchStep { + self.0.next() + } + #[inline(always)] + fn next_match(&mut self) -> Option<(usize, usize)> { + self.0.next_match() + } + #[inline(always)] + fn next_reject(&mut self) -> Option<(usize, usize)> { + self.0.next_reject() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +unsafe impl<'hs> ReverseSearcher<&'hs OsStr> for CharSearcher<'hs> { + #[inline(always)] + fn next_back(&mut self) -> SearchStep { + self.0.next_back() + } + #[inline(always)] + fn next_match_back(&mut self) -> Option<(usize, usize)> { + self.0.next_match_back() + } + #[inline(always)] + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + self.0.next_reject_back() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs> DoubleEndedSearcher<&'hs OsStr> for CharSearcher<'hs> {} + +#[unstable(feature = "pattern", issue = "27721")] +// FIXME: Using Predicate because of: +// error[E0210]: type parameter `F` must be covered by another type when it +// appears before the first local type (`OsStr`) +// --> library/std/src/ffi/os_str.rs:1697:11 +// | +// 1697 | impl<'hs, F: FnMut(char) -> bool> core::pattern::Pattern<&'hs OsStr> for F { +// | ^ type parameter `F` must 
be covered by another type +// when it appears before the first local type (`OsStr`) +impl<'hs, F: FnMut(char) -> bool> core::pattern::Pattern<&'hs OsStr> + for core::pattern::Predicate +{ + type Searcher = PredicateSearcher<'hs, F>; + + fn into_searcher(self, haystack: &'hs OsStr) -> Self::Searcher { + Self::Searcher::new(haystack, self.into_fn()) + } + + fn is_contained_in(self, haystack: &'hs OsStr) -> bool { + self.into_fn().is_contained_in(core::str_bytes::Bytes::from(haystack)) + } + + fn is_prefix_of(self, haystack: &'hs OsStr) -> bool { + self.into_fn().is_prefix_of(core::str_bytes::Bytes::from(haystack)) + } + + fn is_suffix_of(self, haystack: &'hs OsStr) -> bool { + self.into_fn().is_suffix_of(core::str_bytes::Bytes::from(haystack)) + } + + /// Removes the pattern from the front of haystack, if it matches. + fn strip_prefix_of(self, haystack: &'hs OsStr) -> Option<&'hs OsStr> { + self.into_fn() + .strip_prefix_of(core::str_bytes::Bytes::from(haystack)) + .map(|bytes| bytes.into()) + } + + /// Removes the pattern from the back of haystack, if it matches. 
+ fn strip_suffix_of(self, haystack: &'hs OsStr) -> Option<&'hs OsStr> + where + Self::Searcher: ReverseSearcher<&'hs OsStr>, + { + self.into_fn() + .strip_suffix_of(core::str_bytes::Bytes::from(haystack)) + .map(|bytes| bytes.into()) + } +} + +#[derive(Clone, Debug)] +#[unstable(feature = "pattern", issue = "27721")] +pub struct PredicateSearcher<'hs, P>(core::str_bytes::PredicateSearcher<'hs, BytesFlavour, P>); + +impl<'hs, P> PredicateSearcher<'hs, P> { + fn new(haystack: &'hs OsStr, pred: P) -> PredicateSearcher<'hs, P> { + Self(core::str_bytes::PredicateSearcher::new(haystack.into(), pred)) + } +} + +#[unstable(feature = "pattern", issue = "27721")] +unsafe impl<'hs, P: FnMut(char) -> bool> Searcher<&'hs OsStr> for PredicateSearcher<'hs, P> { + #[inline(always)] + fn haystack(&self) -> &'hs OsStr { + self.0.haystack().into() + } + + #[inline(always)] + fn next(&mut self) -> SearchStep { + self.0.next() + } + #[inline(always)] + fn next_match(&mut self) -> Option<(usize, usize)> { + self.0.next_match() + } + #[inline(always)] + fn next_reject(&mut self) -> Option<(usize, usize)> { + self.0.next_reject() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +unsafe impl<'hs, P: FnMut(char) -> bool> ReverseSearcher<&'hs OsStr> for PredicateSearcher<'hs, P> { + #[inline(always)] + fn next_back(&mut self) -> SearchStep { + self.0.next_back() + } + #[inline(always)] + fn next_match_back(&mut self) -> Option<(usize, usize)> { + self.0.next_match_back() + } + #[inline(always)] + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + self.0.next_reject_back() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs, P: FnMut(char) -> bool> DoubleEndedSearcher<&'hs OsStr> for PredicateSearcher<'hs, P> {} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs, 'p> core::pattern::Pattern<&'hs OsStr> for &'p str { + type Searcher = StrSearcher<'hs, 'p>; + + fn into_searcher(self, haystack: &'hs OsStr) -> Self::Searcher { + 
Self::Searcher::new(haystack, self)
+    }
+
+    fn is_contained_in(self, haystack: &'hs OsStr) -> bool {
+        self.is_contained_in(core::str_bytes::Bytes::from(haystack))
+    }
+
+    fn is_prefix_of(self, haystack: &'hs OsStr) -> bool {
+        self.is_prefix_of(core::str_bytes::Bytes::from(haystack))
+    }
+
+    fn is_suffix_of(self, haystack: &'hs OsStr) -> bool {
+        self.is_suffix_of(core::str_bytes::Bytes::from(haystack))
+    }
+
+    /// Removes the pattern from the front of haystack, if it matches.
+    fn strip_prefix_of(self, haystack: &'hs OsStr) -> Option<&'hs OsStr> {
+        self.strip_prefix_of(core::str_bytes::Bytes::from(haystack))
+            .map(|bytes| bytes.into())
+    }
+
+    /// Removes the pattern from the back of haystack, if it matches.
+    fn strip_suffix_of(self, haystack: &'hs OsStr) -> Option<&'hs OsStr>
+    where
+        Self::Searcher: ReverseSearcher<&'hs OsStr>,
+    {
+        self.strip_suffix_of(core::str_bytes::Bytes::from(haystack))
+            .map(|bytes| bytes.into())
+    }
+}
+
+#[derive(Clone, Debug)]
+#[unstable(feature = "pattern", issue = "27721")]
+pub struct StrSearcher<'hs, 'p>(core::str_bytes::StrSearcher<'hs, 'p, BytesFlavour>);
+
+impl<'hs, 'p> StrSearcher<'hs, 'p> {
+    fn new(haystack: &'hs OsStr, needle: &'p str) -> StrSearcher<'hs, 'p> {
+        let haystack = core::str_bytes::Bytes::from(haystack);
+        Self(core::str_bytes::StrSearcher::new(haystack, needle))
+    }
+}
+
+#[unstable(feature = "pattern", issue = "27721")]
+unsafe impl<'hs, 'p> Searcher<&'hs OsStr> for StrSearcher<'hs, 'p> {
+    #[inline(always)]
+    fn haystack(&self) -> &'hs OsStr {
+        self.0.haystack().into()
+    }
+
+    #[inline(always)]
+    fn next(&mut self) -> SearchStep {
+        self.0.next()
+    }
+    #[inline(always)]
+    fn next_match(&mut self) -> Option<(usize, usize)> {
+        self.0.next_match()
+    }
+    #[inline(always)]
+    fn next_reject(&mut self) -> Option<(usize, usize)> {
+        self.0.next_reject()
+    }
+}
+
+#[unstable(feature = "pattern",
issue = "27721")] +unsafe impl<'hs, 'p> ReverseSearcher<&'hs OsStr> for StrSearcher<'hs, 'p> { + #[inline(always)] + fn next_back(&mut self) -> SearchStep { + self.0.next_back() + } + #[inline(always)] + fn next_match_back(&mut self) -> Option<(usize, usize)> { + self.0.next_match_back() + } + #[inline(always)] + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + self.0.next_reject_back() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +pub struct Split<'hs, S: Searcher<&'hs OsStr>>(core::pattern::Split<&'hs OsStr, S>); + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs, S: Searcher<&'hs OsStr> + Clone> Clone for Split<'hs, S> { + fn clone(&self) -> Self { + Self(self.0.clone()) + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs, S: Searcher<&'hs OsStr> + fmt::Debug> fmt::Debug for Split<'hs, S> { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(fmt) + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs, S: Searcher<&'hs OsStr>> Iterator for Split<'hs, S> { + type Item = &'hs OsStr; + + fn next(&mut self) -> Option { + self.0.next_fwd::() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs, S: DoubleEndedSearcher<&'hs OsStr>> DoubleEndedIterator for Split<'hs, S> { + fn next_back(&mut self) -> Option { + self.0.next_bwd::() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'hs, S: Searcher<&'hs OsStr>> core::iter::FusedIterator for Split<'hs, S> {} diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs index 318a46d1b637e..db318a50193b4 100644 --- a/library/std/src/lib.rs +++ b/library/std/src/lib.rs @@ -240,6 +240,7 @@ #![feature(allocator_internals)] #![feature(allow_internal_unsafe)] #![feature(allow_internal_unstable)] +#![feature(associated_type_bounds)] #![feature(c_unwind)] #![feature(cfg_target_thread_local)] #![feature(concat_idents)] @@ -298,6 +299,8 @@ #![feature(panic_can_unwind)] #![feature(panic_info_message)] 
#![feature(panic_internals)] +#![feature(pattern)] +#![feature(pattern_internals)] #![feature(pointer_byte_offsets)] #![feature(pointer_is_aligned)] #![feature(portable_simd)] @@ -491,6 +494,8 @@ pub use core::mem; pub use core::ops; #[stable(feature = "rust1", since = "1.0.0")] pub use core::option; +#[unstable(feature = "pattern", issue = "27721")] +pub use core::pattern; #[stable(feature = "pin", since = "1.33.0")] pub use core::pin; #[stable(feature = "rust1", since = "1.0.0")] diff --git a/library/std/src/sys/unix/os_str.rs b/library/std/src/sys/unix/os_str.rs index 488217f39413f..69184ce445a19 100644 --- a/library/std/src/sys/unix/os_str.rs +++ b/library/std/src/sys/unix/os_str.rs @@ -19,15 +19,31 @@ mod tests; #[derive(Hash)] #[repr(transparent)] -pub struct Buf { +pub(crate) struct Buf { pub inner: Vec, } #[repr(transparent)] -pub struct Slice { +pub(crate) struct Slice { pub inner: [u8], } +pub(crate) type BytesFlavour = core::str_bytes::Unstructured; + +#[unstable(feature = "pattern", issue = "27721")] +impl<'a> From<&'a Slice> for core::str_bytes::Bytes<'a, BytesFlavour> { + fn from(slice: &'a Slice) -> Self { + (&slice.inner).into() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'a> From> for &'a Slice { + fn from(bytes: core::str_bytes::Bytes<'a, BytesFlavour>) -> Self { + Slice::from_u8_slice(bytes.as_bytes()) + } +} + impl fmt::Debug for Slice { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Debug::fmt(&Utf8Chunks::new(&self.inner).debug(), f) @@ -270,4 +286,10 @@ impl Slice { pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { self.inner.eq_ignore_ascii_case(&other.inner) } + + #[inline] + pub(crate) unsafe fn get_unchecked(&self, range: core::ops::Range) -> &Self { + // SAFETY: Caller promises `range` is valid. 
+ Self::from_u8_slice(unsafe { self.inner.get_unchecked(range) }) + } } diff --git a/library/std/src/sys/windows/os_str.rs b/library/std/src/sys/windows/os_str.rs index 2f2b0e56e0889..55c41195b9ef5 100644 --- a/library/std/src/sys/windows/os_str.rs +++ b/library/std/src/sys/windows/os_str.rs @@ -10,7 +10,7 @@ use crate::sys_common::wtf8::{Wtf8, Wtf8Buf}; use crate::sys_common::{AsInner, FromInner, IntoInner}; #[derive(Clone, Hash)] -pub struct Buf { +pub(crate) struct Buf { pub inner: Wtf8Buf, } @@ -46,10 +46,28 @@ impl fmt::Display for Buf { } #[repr(transparent)] -pub struct Slice { +pub(crate) struct Slice { pub inner: Wtf8, } +pub(crate) type BytesFlavour = core::str_bytes::Wtf8; + +#[unstable(feature = "pattern", issue = "27721")] +impl<'a> From<&'a Slice> for core::str_bytes::Bytes<'a, BytesFlavour> { + fn from(slice: &'a Slice) -> Self { + (&slice.inner).into() + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'a> From> for &'a Slice { + fn from(bytes: core::str_bytes::Bytes<'a, BytesFlavour>) -> &'a Slice { + let inner = <&Wtf8>::from(bytes); + // SAFETY: `Slice` is transparent wrapper around `Wtf8`. + unsafe { mem::transmute(inner) } + } +} + impl fmt::Debug for Slice { fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Debug::fmt(&self.inner, formatter) @@ -222,4 +240,12 @@ impl Slice { pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { self.inner.eq_ignore_ascii_case(&other.inner) } + + #[inline] + pub(crate) unsafe fn get_unchecked(&self, range: core::ops::Range) -> &Self { + // SAFETY: Caller promises `range` is valid. + let inner = unsafe { self.inner.get_unchecked(range) }; + // SAFETY: We’re just a transparent wrapper around `Wtf8`. 
+ unsafe { mem::transmute(inner) } + } } diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs index ff96c35fb0ba6..4649fa221286d 100644 --- a/library/std/src/sys_common/wtf8.rs +++ b/library/std/src/sys_common/wtf8.rs @@ -42,7 +42,7 @@ const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}"; /// which represents a Unicode scalar value: /// a code point that is not a surrogate (U+D800 to U+DFFF). #[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)] -pub struct CodePoint { +pub(crate) struct CodePoint { value: u32, } @@ -133,7 +133,7 @@ impl CodePoint { /// Similar to `String`, but can additionally contain surrogate code points /// if they’re not in a surrogate pair. #[derive(Eq, PartialEq, Ord, PartialOrd, Clone)] -pub struct Wtf8Buf { +pub(crate) struct Wtf8Buf { bytes: Vec, /// Do we know that `bytes` holds a valid UTF-8 encoding? We can easily @@ -496,7 +496,8 @@ impl Extend for Wtf8Buf { /// Similar to `&str`, but can additionally contain surrogate code points /// if they’re not in a surrogate pair. #[derive(Eq, Ord, PartialEq, PartialOrd)] -pub struct Wtf8 { +#[repr(transparent)] +pub(crate) struct Wtf8 { bytes: [u8], } @@ -781,6 +782,14 @@ impl Wtf8 { pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { self.bytes.eq_ignore_ascii_case(&other.bytes) } + + #[inline] + pub(crate) unsafe fn get_unchecked(&self, range: core::ops::Range) -> &Self { + // SAFETY: Caller promises `range` is valid. + let bytes = unsafe { self.bytes.get_unchecked(range) }; + // SAFETY: We’re just a transparent wrapper around [u8]. + unsafe { mem::transmute(bytes) } + } } /// Returns a slice of the given string for the byte range \[`begin`..`end`). 
@@ -869,7 +878,7 @@ fn decode_surrogate_pair(lead: u16, trail: u16) -> char { /// Copied from core::str::StrPrelude::is_char_boundary #[inline] -pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool { +pub(crate) fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool { if index == slice.len() { return true; } @@ -881,14 +890,14 @@ pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool { /// Copied from core::str::raw::slice_unchecked #[inline] -pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 { +pub(crate) unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 { // memory layout of a &[u8] and &Wtf8 are the same Wtf8::from_bytes_unchecked(slice::from_raw_parts(s.bytes.as_ptr().add(begin), end - begin)) } /// Copied from core::str::raw::slice_error_fail #[inline(never)] -pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! { +pub(crate) fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! { assert!(begin <= end); panic!("index {begin} and/or {end} in `{s:?}` do not lie on character boundary"); } @@ -897,7 +906,7 @@ pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! { /// /// Created with the method `.code_points()`. #[derive(Clone)] -pub struct Wtf8CodePoints<'a> { +pub(crate) struct Wtf8CodePoints<'a> { bytes: slice::Iter<'a, u8>, } @@ -984,3 +993,20 @@ impl Hash for Wtf8 { 0xfeu8.hash(state) } } + +#[unstable(feature = "pattern", issue = "27721")] +impl<'a> From<&'a Wtf8> for core::str_bytes::Bytes<'a, core::str_bytes::Wtf8> { + fn from(wtf8: &'a Wtf8) -> Self { + // SAFETY: As the name implies, `Wtf8`’s bytes are guaranteed to be WTF-8 + // so `Wtf8` flavour is correct. + unsafe { core::str_bytes::Bytes::new(&wtf8.bytes) } + } +} + +#[unstable(feature = "pattern", issue = "27721")] +impl<'a> From<core::str_bytes::Bytes<'a, core::str_bytes::Wtf8>> for &'a Wtf8 { + fn from(bytes: core::str_bytes::Bytes<'a, core::str_bytes::Wtf8>) -> Self { + // SAFETY: Bytes<'_, Wtf8> are guaranteed to be well-formed WTF-8.
+ unsafe { Wtf8::from_bytes_unchecked(bytes.as_bytes()) } + } +} diff --git a/library/std/tests/os_str.rs b/library/std/tests/os_str.rs new file mode 100644 index 0000000000000..24dd4e1434567 --- /dev/null +++ b/library/std/tests/os_str.rs @@ -0,0 +1,2598 @@ +#![feature(associated_type_bounds, pattern)] + +use core::pattern::{Pattern, Searcher, ReverseSearcher, predicate}; +use std::borrow::Cow; +use std::ffi::{OsStr, OsString}; + +//////////////////////////////////////////////////////////////////////////////// +// Helper functions for creating OsStr and OsString + +/// Cast `str` into `OsStr`. This is a trivial convenience function. +fn os(value: &str) -> &OsStr { + OsStr::new(value) +} + +/// Constructs an OsString with potentially invalid UTF-8. +/// +/// If `valid` is `false`, some characters are replaced by invalid sequences +/// (see `map_invalid`) resulting in the returned OsString not being a valid String. +fn make_os_string(value: &str, valid: bool) -> OsString { + if valid { + OsString::from(value) + } else { + make_invalid_os_string(value) + } +} + +fn map_invalid(chr: char) -> Result<char, u8> { + match chr { + 'ą' => Err(0xB1), + 'ä' => Err(0xE4), + 'ă' => Err(0xE3), + 'ó' => Err(0xF3), + chr => Ok(chr), + } +} + +#[cfg(unix)] +fn make_invalid_os_string(value: &str) -> OsString { + use std::os::unix::ffi::OsStringExt; + + let mut vec = Vec::with_capacity(value.len()); + let mut buf = [0; 4]; + for chr in value.chars() { + match map_invalid(chr) { + Ok(chr) => vec.extend_from_slice(chr.encode_utf8(&mut buf).as_bytes()), + Err(byte) => vec.push(byte) + } + } + OsString::from_vec(vec) +} + +#[cfg(windows)] +fn make_invalid_os_string(value: &str) -> OsString { + use std::os::windows::ffi::OsStringExt; + + let mut vec = Vec::with_capacity(value.len()); + let mut buf = [0; 2]; + for chr in value.chars() { + match map_invalid(chr) { + Ok(chr) => vec.extend_from_slice(chr.encode_utf16(&mut buf)), + Err(byte) => vec.push(0xD800 | byte as u16), + } + } +
OsStringExt::from_wide(&vec) +} + +//////////////////////////////////////////////////////////////////////////////// +// Test of features demonstrating command-line argument parsing + +fn do_test_long_flag(valid: bool) { + let os = |value| { make_os_string(value, valid) }; + + // strip_prefix("--") and strip_prefix('-') can be used to check if it’s + // a flag argument or not. + let arg = os("--flăg=fóó,bąr,bäz"); + assert_eq!(Some(&*os("-flăg=fóó,bąr,bäz")), arg.strip_prefix('-')); + assert_eq!(Some(&*os("-flăg=fóó,bąr,bäz")), arg.strip_prefix("-")); + assert_eq!(Some(&*os("flăg=fóó,bąr,bäz")), arg.strip_prefix("--")); + + // split_once('=') separates long flag name from its value. If + // split_once returns None, there’s no value with the flag. + let arg = os("flăg=fóó,bąr,bäz"); + assert_eq!( + Some((&*os("flăg"), &*os("fóó,bąr,bäz"))), + arg.split_once('=') + ); + assert_eq!(None, os("flăg").split_once('=')); + + // split(',') separates values in flag whose values are comma separated. + let arg = os("fóó,bąr,bäz"); + let values = arg.split(',').collect::<Vec<_>>(); + assert_eq!(&[os("fóó"), os("bąr"), os("bäz")][..], values.as_slice()) +} + +#[test] +fn test_long_flag() { + do_test_long_flag(true) +} + +#[test] +fn test_long_flag_non_utf8() { + do_test_long_flag(false) +} + +fn do_test_short_flag(valid: bool) { + let os = |value| { make_os_string(value, valid) }; + + // strip_prefix("--") and strip_prefix('-') can be used to check if it’s + // a flag argument or not. + let arg = os("-shórt"); + assert_eq!(Some(&*os("shórt")), arg.strip_prefix('-')); + assert_eq!(Some(&*os("shórt")), arg.strip_prefix("-")); + assert_eq!(None, arg.strip_prefix("--")); + + // A bit awkward but a closure can be used to test short options character + // by character.
+ let mut switch = '\0'; + let mut check_switch = |chr| { + switch = chr; + chr == 's' || chr == 'h' + }; + assert_eq!( + Some(&*os("hórt")), + os("shórt").strip_prefix(predicate(&mut check_switch)) + ); + assert_eq!( + Some(&*os("órt")), + os("hórt").strip_prefix(predicate(&mut check_switch)) + ); + assert_eq!(None, os("órt").strip_prefix(predicate(&mut check_switch))); +} + +#[test] +fn test_short_flag() { + do_test_short_flag(true) +} + +#[test] +fn test_short_flag_non_utf8() { + do_test_short_flag(false) +} + +//////////////////////////////////////////////////////////////////////////////// +// Test adapted from library/alloc/tests/str.rs + +// We currently don’t offer the full set of pattern-matching methods on OsStr which +// are available on str. At least some of them can be implemented using Pattern +// API so do that for the sake of testing. + +fn find<'a>(haystack: &'a str, pat: impl Pattern<&'a OsStr>) -> Option<usize> { + pat.into_searcher(os(haystack)).next_match().map(|(i, _)| i) +} + +fn rfind<'a, P>(haystack: &'a str, pat: P) -> Option<usize> +where + P: Pattern<&'a OsStr, Searcher: ReverseSearcher<&'a OsStr>>, +{ + pat.into_searcher(os(haystack)).next_match_back().map(|(i, _)| i) +} + +pub fn contains<'a>(haystack: &'a str, pat: impl Pattern<&'a OsStr>) -> bool { + pat.is_contained_in(os(haystack)) +} + + +#[test] +fn test_le() { + assert!(os("") <= ""); + assert!(os("") <= "foo"); + assert!(os("foo") <= "foo"); + assert_ne!(os("foo"), "bar"); +} + +#[test] +fn test_find() { + assert_eq!(find("hello", 'l'), Some(2)); + assert_eq!(find("hello", predicate(|c: char| c == 'o')), Some(4)); + assert!(find("hello", 'x').is_none()); + assert!(find("hello", predicate(|c: char| c == 'x')).is_none()); + assert_eq!(find("ประเทศไทย中华Việt Nam", '华'), Some(30)); + assert_eq!(find("ประเทศไทย中华Việt Nam", predicate(|c: char| c == '华')), Some(30)); +} + +#[test] +fn test_rfind() { + assert_eq!(rfind("hello", 'l'), Some(3)); + assert_eq!(rfind("hello", predicate(|c: char| c == 'o')), 
Some(4)); + assert!(rfind("hello", 'x').is_none()); + assert!(rfind("hello", predicate(|c: char| c == 'x')).is_none()); + assert_eq!(rfind("ประเทศไทย中华Việt Nam", '华'), Some(30)); + assert_eq!(rfind("ประเทศไทย中华Việt Nam", predicate(|c: char| c == '华')), Some(30)); +} + +/* +#[test] +fn test_collect() { + let empty = ""; + let s: String = empty.chars().collect(); + assert_eq!(empty, s); + let data = "ประเทศไทย中"; + let s: String = data.chars().collect(); + assert_eq!(data, s); +} +*/ + +#[test] +fn test_find_str() { + // byte positions + assert_eq!(find("", ""), Some(0)); + assert!(find("banana", "apple pie").is_none()); + + assert_eq!(find("abcabc", "ab"), Some(0)); + assert_eq!(find("cabc", "ab"), Some(1)); + assert!(find("ca", "ab").is_none()); + + let string = "ประเทศไทย中华Việt Nam"; + let mut data = String::from(string); + data.push_str(string); + assert!(find(&data, "ไท华").is_none()); + assert_eq!(find(&data[0..43], ""), Some(0)); + assert_eq!(find(&data[6..43], ""), Some(6 - 6)); + + assert_eq!(find(&data[0..43], "ประ"), Some(0)); + assert_eq!(find(&data[0..43], "ทศไ"), Some(12)); + assert_eq!(find(&data[0..43], "ย中"), Some(24)); + assert_eq!(find(&data[0..43], "iệt"), Some(34)); + assert_eq!(find(&data[0..43], "Nam"), Some(40)); + + assert_eq!(find(&data[43..86], "ประ"), Some(43 - 43)); + assert_eq!(find(&data[43..86], "ทศไ"), Some(55 - 43)); + assert_eq!(find(&data[43..86], "ย中"), Some(67 - 43)); + assert_eq!(find(&data[43..86], "iệt"), Some(77 - 43)); + assert_eq!(find(&data[43..86], "Nam"), Some(83 - 43)); + + // find every substring -- assert that it finds it, or an earlier occurrence. 
+ let string = "Việt Namacbaabcaabaaba"; + for (i, ci) in string.char_indices() { + let ip = i + ci.len_utf8(); + for j in string[ip..].char_indices().map(|(i, _)| i).chain(Some(string.len() - ip)) { + let pat = &string[i..ip + j]; + assert!(match find(&string, pat) { + None => false, + Some(x) => x <= i, + }); + assert!(match rfind(&string, pat) { + None => false, + Some(x) => x >= i, + }); + } + } +} + +/* +fn s(x: &str) -> String { + x.to_string() +} + +macro_rules! test_concat { + ($expected: expr, $string: expr) => {{ + let s: String = $string.concat(); + assert_eq!($expected, s); + }}; +} + +#[test] +fn test_concat_for_different_types() { + test_concat!("ab", vec![s("a"), s("b")]); + test_concat!("ab", vec!["a", "b"]); +} + +#[test] +fn test_concat_for_different_lengths() { + let empty: &[&str] = &[]; + test_concat!("", empty); + test_concat!("a", ["a"]); + test_concat!("ab", ["a", "b"]); + test_concat!("abc", ["", "a", "bc"]); +} + */ + +/* +macro_rules! test_join { + ($expected: expr, $string: expr, $delim: expr) => {{ + let s = $string.join($delim); + assert_eq!($expected, s); + }}; +} + +#[test] +fn test_join_for_different_types() { + test_join!("a-b", ["a", "b"], "-"); + let hyphen = "-".to_string(); + test_join!("a-b", [s("a"), s("b")], &*hyphen); + test_join!("a-b", vec!["a", "b"], &*hyphen); + test_join!("a-b", &*vec!["a", "b"], "-"); + test_join!("a-b", vec![s("a"), s("b")], "-"); +} + +#[test] +fn test_join_for_different_lengths() { + let empty: &[&str] = &[]; + test_join!("", empty, "-"); + test_join!("a", ["a"], "-"); + test_join!("a-b", ["a", "b"], "-"); + test_join!("-a-bc", ["", "a", "bc"], "-"); +} + +// join has fast paths for small separators up to 4 bytes +// this tests the slow paths. 
+#[test] +fn test_join_for_different_lengths_with_long_separator() { + assert_eq!("～～～～～".len(), 15); + + let empty: &[&str] = &[]; + test_join!("", empty, "～～～～～"); + test_join!("a", ["a"], "～～～～～"); + test_join!("a～～～～～b", ["a", "b"], "～～～～～"); + test_join!("～～～～～a～～～～～bc", ["", "a", "bc"], "～～～～～"); +} + +#[test] +fn test_join_issue_80335() { + use core::{borrow::Borrow, cell::Cell}; + + struct WeirdBorrow { + state: Cell<bool>, + } + + impl Default for WeirdBorrow { + fn default() -> Self { + WeirdBorrow { state: Cell::new(false) } + } + } + + impl Borrow<str> for WeirdBorrow { + fn borrow(&self) -> &str { + let state = self.state.get(); + if state { + "0" + } else { + self.state.set(true); + "123456" + } + } + } + + let arr: [WeirdBorrow; 3] = Default::default(); + test_join!("0-0-0", arr, "-"); +} + +#[test] +#[cfg_attr(miri, ignore)] // Miri is too slow +fn test_unsafe_slice() { + assert_eq!("ab", unsafe { "abc".get_unchecked(0..2) }); + assert_eq!("bc", unsafe { "abc".get_unchecked(1..3) }); + assert_eq!("", unsafe { "abc".get_unchecked(1..1) }); + fn a_million_letter_a() -> String { + let mut i = 0; + let mut rs = String::new(); + while i < 100000 { + rs.push_str("aaaaaaaaaa"); + i += 1; + } + rs + } + fn half_a_million_letter_a() -> String { + let mut i = 0; + let mut rs = String::new(); + while i < 100000 { + rs.push_str("aaaaa"); + i += 1; + } + rs + } + let letters = a_million_letter_a(); + assert_eq!(half_a_million_letter_a(), unsafe { letters.get_unchecked(0..500000) }); +} +*/ + +#[test] +fn test_starts_with() { + assert!(os("").starts_with("")); + assert!(os("abc").starts_with("")); + assert!(os("abc").starts_with("a")); + assert!(!os("a").starts_with("abc")); + assert!(!os("").starts_with("abc")); + assert!(!os("ödd").starts_with("-")); + assert!(os("ödd").starts_with("öd")); +} + +#[test] +fn test_ends_with() { + assert!(os("").ends_with("")); + assert!(os("abc").ends_with("")); + assert!(os("abc").ends_with("c")); + assert!(!os("a").ends_with("abc")); + 
assert!(!os("").ends_with("abc")); + assert!(!os("ddö").ends_with("-")); + assert!(os("ddö").ends_with("dö")); +} + +#[test] +fn test_is_empty() { + assert!(os("").is_empty()); + assert!(!os("a").is_empty()); +} + +/* +#[test] +fn test_replacen() { + assert_eq!("".replacen('a', "b", 5), ""); + assert_eq!("acaaa".replacen("a", "b", 3), "bcbba"); + assert_eq!("aaaa".replacen("a", "b", 0), "aaaa"); + + let test = "test"; + assert_eq!(" test test ".replacen(test, "toast", 3), " toast toast "); + assert_eq!(" test test ".replacen(test, "toast", 0), " test test "); + assert_eq!(" test test ".replacen(test, "", 5), " "); + + assert_eq!("qwer123zxc789".replacen(char::is_numeric, "", 3), "qwerzxc789"); +} + +#[test] +fn test_replace() { + let a = "a"; + assert_eq!("".replace(a, "b"), ""); + assert_eq!("a".replace(a, "b"), "b"); + assert_eq!("ab".replace(a, "b"), "bb"); + let test = "test"; + assert_eq!(" test test ".replace(test, "toast"), " toast toast "); + assert_eq!(" test test ".replace(test, ""), " "); +} + +#[test] +fn test_replace_2a() { + let data = "ประเทศไทย中华"; + let repl = "دولة الكويت"; + + let a = "ประเ"; + let a2 = "دولة الكويتทศไทย中华"; + assert_eq!(data.replace(a, repl), a2); +} + +#[test] +fn test_replace_2b() { + let data = "ประเทศไทย中华"; + let repl = "دولة الكويت"; + + let b = "ะเ"; + let b2 = "ปรدولة الكويتทศไทย中华"; + assert_eq!(data.replace(b, repl), b2); +} + +#[test] +fn test_replace_2c() { + let data = "ประเทศไทย中华"; + let repl = "دولة الكويت"; + + let c = "中华"; + let c2 = "ประเทศไทยدولة الكويت"; + assert_eq!(data.replace(c, repl), c2); +} + +#[test] +fn test_replace_2d() { + let data = "ประเทศไทย中华"; + let repl = "دولة الكويت"; + + let d = "ไท华"; + assert_eq!(data.replace(d, repl), data); +} + +#[test] +fn test_replace_pattern() { + let data = "abcdαβγδabcdαβγδ"; + assert_eq!(data.replace("dαβ", "😺😺😺"), "abc😺😺😺γδabc😺😺😺γδ"); + assert_eq!(data.replace('γ', "😺😺😺"), "abcdαβ😺😺😺δabcdαβ😺😺😺δ"); + assert_eq!(data.replace(&['a', 'γ'] as &[_], "😺😺😺"), 
"😺😺😺bcdαβ😺😺😺δ😺😺😺bcdαβ😺😺😺δ"); + assert_eq!(data.replace(|c| c == 'γ', "😺😺😺"), "abcdαβ😺😺😺δabcdαβ😺😺😺δ"); +} + +// The current implementation of SliceIndex fails to handle methods +// orthogonally from range types; therefore, it is worth testing +// all of the indexing operations on each input. +mod slice_index { + // Test a slicing operation **that should succeed,** + // testing it on all of the indexing methods. + // + // This is not suitable for testing failure on invalid inputs. + macro_rules! assert_range_eq { + ($s:expr, $range:expr, $expected:expr) => { + let mut s: String = $s.to_owned(); + let mut expected: String = $expected.to_owned(); + { + let s: &str = &s; + let expected: &str = &expected; + + assert_eq!(&s[$range], expected, "(in assertion for: index)"); + assert_eq!(s.get($range), Some(expected), "(in assertion for: get)"); + unsafe { + assert_eq!( + s.get_unchecked($range), + expected, + "(in assertion for: get_unchecked)", + ); + } + } + { + let s: &mut str = &mut s; + let expected: &mut str = &mut expected; + + assert_eq!(&mut s[$range], expected, "(in assertion for: index_mut)",); + assert_eq!( + s.get_mut($range), + Some(&mut expected[..]), + "(in assertion for: get_mut)", + ); + unsafe { + assert_eq!( + s.get_unchecked_mut($range), + expected, + "(in assertion for: get_unchecked_mut)", + ); + } + } + }; + } + + // Make sure the macro can actually detect bugs, + // because if it can't, then what are we even doing here? 
+ // + // (Be aware this only demonstrates the ability to detect bugs + // in the FIRST method that panics, as the macro is not designed + // to be used in `should_panic`) + #[test] + #[should_panic(expected = "out of bounds")] + fn assert_range_eq_can_fail_by_panic() { + assert_range_eq!("abc", 0..5, "abc"); + } + + // (Be aware this only demonstrates the ability to detect bugs + // in the FIRST method it calls, as the macro is not designed + // to be used in `should_panic`) + #[test] + #[should_panic(expected = "==")] + fn assert_range_eq_can_fail_by_inequality() { + assert_range_eq!("abc", 0..2, "abc"); + } + + // Generates test cases for bad index operations. + // + // This generates `should_panic` test cases for Index/IndexMut + // and `None` test cases for get/get_mut. + macro_rules! panic_cases { + ($( + in mod $case_name:ident { + data: $data:expr; + + // optional: + // + // a similar input for which DATA[input] succeeds, and the corresponding + // output str. This helps validate "critical points" where an input range + // straddles the boundary between valid and invalid. 
+ // (such as the input `len..len`, which is just barely valid) + $( + good: data[$good:expr] == $output:expr; + )* + + bad: data[$bad:expr]; + message: $expect_msg:expr; // must be a literal + } + )*) => {$( + mod $case_name { + #[test] + fn pass() { + let mut v: String = $data.into(); + + $( assert_range_eq!(v, $good, $output); )* + + { + let v: &str = &v; + assert_eq!(v.get($bad), None, "(in None assertion for get)"); + } + + { + let v: &mut str = &mut v; + assert_eq!(v.get_mut($bad), None, "(in None assertion for get_mut)"); + } + } + + #[test] + #[should_panic(expected = $expect_msg)] + fn index_fail() { + let v: String = $data.into(); + let v: &str = &v; + let _v = &v[$bad]; + } + + #[test] + #[should_panic(expected = $expect_msg)] + fn index_mut_fail() { + let mut v: String = $data.into(); + let v: &mut str = &mut v; + let _v = &mut v[$bad]; + } + } + )*}; + } + + #[test] + fn simple_ascii() { + assert_range_eq!("abc", .., "abc"); + + assert_range_eq!("abc", 0..2, "ab"); + assert_range_eq!("abc", 0..=1, "ab"); + assert_range_eq!("abc", ..2, "ab"); + assert_range_eq!("abc", ..=1, "ab"); + + assert_range_eq!("abc", 1..3, "bc"); + assert_range_eq!("abc", 1..=2, "bc"); + assert_range_eq!("abc", 1..1, ""); + assert_range_eq!("abc", 1..=0, ""); + } + + #[test] + fn simple_unicode() { + // 日本 + assert_range_eq!("\u{65e5}\u{672c}", .., "\u{65e5}\u{672c}"); + + assert_range_eq!("\u{65e5}\u{672c}", 0..3, "\u{65e5}"); + assert_range_eq!("\u{65e5}\u{672c}", 0..=2, "\u{65e5}"); + assert_range_eq!("\u{65e5}\u{672c}", ..3, "\u{65e5}"); + assert_range_eq!("\u{65e5}\u{672c}", ..=2, "\u{65e5}"); + + assert_range_eq!("\u{65e5}\u{672c}", 3..6, "\u{672c}"); + assert_range_eq!("\u{65e5}\u{672c}", 3..=5, "\u{672c}"); + assert_range_eq!("\u{65e5}\u{672c}", 3.., "\u{672c}"); + + let data = "ประเทศไทย中华"; + assert_range_eq!(data, 0..3, "ป"); + assert_range_eq!(data, 3..6, "ร"); + assert_range_eq!(data, 3..3, ""); + assert_range_eq!(data, 30..33, "华"); + + /*0: 中 + 3: 华 + 6: V + 7: i 
+ 8: ệ + 11: t + 12: + 13: N + 14: a + 15: m */ + let ss = "中华Việt Nam"; + assert_range_eq!(ss, 3..6, "华"); + assert_range_eq!(ss, 6..16, "Việt Nam"); + assert_range_eq!(ss, 6..=15, "Việt Nam"); + assert_range_eq!(ss, 6.., "Việt Nam"); + + assert_range_eq!(ss, 0..3, "中"); + assert_range_eq!(ss, 3..7, "华V"); + assert_range_eq!(ss, 3..=6, "华V"); + assert_range_eq!(ss, 3..3, ""); + assert_range_eq!(ss, 3..=2, ""); + } + + #[test] + #[cfg_attr(target_os = "emscripten", ignore)] // hits an OOM + #[cfg_attr(miri, ignore)] // Miri is too slow + fn simple_big() { + fn a_million_letter_x() -> String { + let mut i = 0; + let mut rs = String::new(); + while i < 100000 { + rs.push_str("华华华华华华华华华华"); + i += 1; + } + rs + } + fn half_a_million_letter_x() -> String { + let mut i = 0; + let mut rs = String::new(); + while i < 100000 { + rs.push_str("华华华华华"); + i += 1; + } + rs + } + let letters = a_million_letter_x(); + assert_range_eq!(letters, 0..3 * 500000, half_a_million_letter_x()); + } + + #[test] + #[should_panic] + fn test_slice_fail() { + let _ = &"中华Việt Nam"[0..2]; + } + + panic_cases! { + in mod rangefrom_len { + data: "abcdef"; + good: data[6..] == ""; + bad: data[7..]; + message: "out of bounds"; + } + + in mod rangeto_len { + data: "abcdef"; + good: data[..6] == "abcdef"; + bad: data[..7]; + message: "out of bounds"; + } + + in mod rangetoinclusive_len { + data: "abcdef"; + good: data[..=5] == "abcdef"; + bad: data[..=6]; + message: "out of bounds"; + } + + in mod rangeinclusive_len { + data: "abcdef"; + good: data[0..=5] == "abcdef"; + bad: data[0..=6]; + message: "out of bounds"; + } + + in mod range_len_len { + data: "abcdef"; + good: data[6..6] == ""; + bad: data[7..7]; + message: "out of bounds"; + } + + in mod rangeinclusive_len_len { + data: "abcdef"; + good: data[6..=5] == ""; + bad: data[7..=6]; + message: "out of bounds"; + } + } + + panic_cases! 
{ + in mod rangeinclusive_exhausted { + data: "abcdef"; + + good: data[0..=5] == "abcdef"; + good: data[{ + let mut iter = 0..=5; + iter.by_ref().count(); // exhaust it + iter + }] == ""; + + // 0..=6 is out of bounds before exhaustion, so it + // stands to reason that it still would be after. + bad: data[{ + let mut iter = 0..=6; + iter.by_ref().count(); // exhaust it + iter + }]; + message: "out of bounds"; + } + } + + panic_cases! { + in mod range_neg_width { + data: "abcdef"; + good: data[4..4] == ""; + bad: data[4..3]; + message: "begin <= end (4 <= 3)"; + } + + in mod rangeinclusive_neg_width { + data: "abcdef"; + good: data[4..=3] == ""; + bad: data[4..=2]; + message: "begin <= end (4 <= 3)"; + } + } + + mod overflow { + panic_cases! { + in mod rangeinclusive { + data: "hello"; + // note: using 0 specifically ensures that the result of overflowing is 0..0, + // so that `get` doesn't simply return None for the wrong reason. + bad: data[0..=usize::MAX]; + message: "maximum usize"; + } + + in mod rangetoinclusive { + data: "hello"; + bad: data[..=usize::MAX]; + message: "maximum usize"; + } + } + } + + mod boundary { + const DATA: &str = "abcαβγ"; + + const BAD_START: usize = 4; + const GOOD_START: usize = 3; + const BAD_END: usize = 6; + const GOOD_END: usize = 7; + const BAD_END_INCL: usize = BAD_END - 1; + const GOOD_END_INCL: usize = GOOD_END - 1; + + // it is especially important to test all of the different range types here + // because some of the logic may be duplicated as part of micro-optimizations + // to dodge unicode boundary checks on half-ranges. + panic_cases! 
{ + in mod range_1 { + data: super::DATA; + bad: data[super::BAD_START..super::GOOD_END]; + message: + "byte index 4 is not a char boundary; it is inside 'α' (bytes 3..5) of"; + } + + in mod range_2 { + data: super::DATA; + bad: data[super::GOOD_START..super::BAD_END]; + message: + "byte index 6 is not a char boundary; it is inside 'β' (bytes 5..7) of"; + } + + in mod rangefrom { + data: super::DATA; + bad: data[super::BAD_START..]; + message: + "byte index 4 is not a char boundary; it is inside 'α' (bytes 3..5) of"; + } + + in mod rangeto { + data: super::DATA; + bad: data[..super::BAD_END]; + message: + "byte index 6 is not a char boundary; it is inside 'β' (bytes 5..7) of"; + } + + in mod rangeinclusive_1 { + data: super::DATA; + bad: data[super::BAD_START..=super::GOOD_END_INCL]; + message: + "byte index 4 is not a char boundary; it is inside 'α' (bytes 3..5) of"; + } + + in mod rangeinclusive_2 { + data: super::DATA; + bad: data[super::GOOD_START..=super::BAD_END_INCL]; + message: + "byte index 6 is not a char boundary; it is inside 'β' (bytes 5..7) of"; + } + + in mod rangetoinclusive { + data: super::DATA; + bad: data[..=super::BAD_END_INCL]; + message: + "byte index 6 is not a char boundary; it is inside 'β' (bytes 5..7) of"; + } + } + } + + const LOREM_PARAGRAPH: &str = "\ + Lorem ipsum dolor sit amet, consectetur adipiscing elit. Suspendisse quis lorem \ + sit amet dolor ultricies condimentum. Praesent iaculis purus elit, ac malesuada \ + quam malesuada in. Duis sed orci eros. Suspendisse sit amet magna mollis, mollis \ + nunc luctus, imperdiet mi. Integer fringilla non sem ut lacinia. Fusce varius \ + tortor a risus porttitor hendrerit. 
Morbi mauris dui, ultricies nec tempus vel, \ + gravida nec quam."; + + // check the panic includes the prefix of the sliced string + #[test] + #[should_panic(expected = "byte index 1024 is out of bounds of `Lorem ipsum dolor sit amet")] + fn test_slice_fail_truncated_1() { + let _ = &LOREM_PARAGRAPH[..1024]; + } + // check the truncation in the panic message + #[test] + #[should_panic(expected = "luctus, im`[...]")] + fn test_slice_fail_truncated_2() { + let _ = &LOREM_PARAGRAPH[..1024]; + } +} + +#[test] +fn test_str_slice_rangetoinclusive_ok() { + let s = "abcαβγ"; + assert_eq!(&s[..=2], "abc"); + assert_eq!(&s[..=4], "abcα"); +} + +#[test] +#[should_panic] +fn test_str_slice_rangetoinclusive_notok() { + let s = "abcαβγ"; + let _ = &s[..=3]; +} + +#[test] +fn test_str_slicemut_rangetoinclusive_ok() { + let mut s = "abcαβγ".to_owned(); + let s: &mut str = &mut s; + assert_eq!(&mut s[..=2], "abc"); + assert_eq!(&mut s[..=4], "abcα"); +} + +#[test] +#[should_panic] +fn test_str_slicemut_rangetoinclusive_notok() { + let mut s = "abcαβγ".to_owned(); + let s: &mut str = &mut s; + let _ = &mut s[..=3]; +} + +#[test] +fn test_is_char_boundary() { + let s = "ศไทย中华Việt Nam β-release 🐱123"; + assert!(s.is_char_boundary(0)); + assert!(s.is_char_boundary(s.len())); + assert!(!s.is_char_boundary(s.len() + 1)); + for (i, ch) in s.char_indices() { + // ensure character locations are boundaries and continuation bytes are not + assert!(s.is_char_boundary(i), "{} is a char boundary in {:?}", i, s); + for j in 1..ch.len_utf8() { + assert!( + !s.is_char_boundary(i + j), + "{} should not be a char boundary in {:?}", + i + j, + s + ); + } + } +} + +#[test] +fn test_trim_start_matches() { + let v: &[char] = &[]; + assert_eq!(" *** foo *** ".trim_start_matches(v), " *** foo *** "); + let chars: &[char] = &['*', ' ']; + assert_eq!(" *** foo *** ".trim_start_matches(chars), "foo *** "); + assert_eq!(" *** *** ".trim_start_matches(chars), ""); + assert_eq!("foo *** 
".trim_start_matches(chars), "foo *** "); + + assert_eq!("11foo1bar11".trim_start_matches('1'), "foo1bar11"); + let chars: &[char] = &['1', '2']; + assert_eq!("12foo1bar12".trim_start_matches(chars), "foo1bar12"); + assert_eq!("123foo1bar123".trim_start_matches(|c: char| c.is_numeric()), "foo1bar123"); +} + +#[test] +fn test_trim_end_matches() { + let v: &[char] = &[]; + assert_eq!(" *** foo *** ".trim_end_matches(v), " *** foo *** "); + let chars: &[char] = &['*', ' ']; + assert_eq!(" *** foo *** ".trim_end_matches(chars), " *** foo"); + assert_eq!(" *** *** ".trim_end_matches(chars), ""); + assert_eq!(" *** foo".trim_end_matches(chars), " *** foo"); + + assert_eq!("11foo1bar11".trim_end_matches('1'), "11foo1bar"); + let chars: &[char] = &['1', '2']; + assert_eq!("12foo1bar12".trim_end_matches(chars), "12foo1bar"); + assert_eq!("123foo1bar123".trim_end_matches(|c: char| c.is_numeric()), "123foo1bar"); +} + +#[test] +fn test_trim_matches() { + let v: &[char] = &[]; + assert_eq!(" *** foo *** ".trim_matches(v), " *** foo *** "); + let chars: &[char] = &['*', ' ']; + assert_eq!(" *** foo *** ".trim_matches(chars), "foo"); + assert_eq!(" *** *** ".trim_matches(chars), ""); + assert_eq!("foo".trim_matches(chars), "foo"); + + assert_eq!("11foo1bar11".trim_matches('1'), "foo1bar"); + let chars: &[char] = &['1', '2']; + assert_eq!("12foo1bar12".trim_matches(chars), "foo1bar"); + assert_eq!("123foo1bar123".trim_matches(|c: char| c.is_numeric()), "foo1bar"); +} + +#[test] +fn test_trim_start() { + assert_eq!("".trim_start(), ""); + assert_eq!("a".trim_start(), "a"); + assert_eq!(" ".trim_start(), ""); + assert_eq!(" blah".trim_start(), "blah"); + assert_eq!(" \u{3000} wut".trim_start(), "wut"); + assert_eq!("hey ".trim_start(), "hey "); +} + +#[test] +fn test_trim_end() { + assert_eq!("".trim_end(), ""); + assert_eq!("a".trim_end(), "a"); + assert_eq!(" ".trim_end(), ""); + assert_eq!("blah ".trim_end(), "blah"); + assert_eq!("wut \u{3000} ".trim_end(), "wut"); + 
assert_eq!(" hey".trim_end(), " hey"); +} + +#[test] +fn test_trim() { + assert_eq!("".trim(), ""); + assert_eq!("a".trim(), "a"); + assert_eq!(" ".trim(), ""); + assert_eq!(" blah ".trim(), "blah"); + assert_eq!("\nwut \u{3000} ".trim(), "wut"); + assert_eq!(" hey dude ".trim(), "hey dude"); +} + +#[test] +fn test_is_whitespace() { + assert!("".chars().all(|c| c.is_whitespace())); + assert!(" ".chars().all(|c| c.is_whitespace())); + assert!("\u{2009}".chars().all(|c| c.is_whitespace())); // Thin space + assert!(" \n\t ".chars().all(|c| c.is_whitespace())); + assert!(!" _ ".chars().all(|c| c.is_whitespace())); +} + +#[test] +fn test_is_utf8() { + // deny overlong encodings + assert!(from_utf8(&[0xc0, 0x80]).is_err()); + assert!(from_utf8(&[0xc0, 0xae]).is_err()); + assert!(from_utf8(&[0xe0, 0x80, 0x80]).is_err()); + assert!(from_utf8(&[0xe0, 0x80, 0xaf]).is_err()); + assert!(from_utf8(&[0xe0, 0x81, 0x81]).is_err()); + assert!(from_utf8(&[0xf0, 0x82, 0x82, 0xac]).is_err()); + assert!(from_utf8(&[0xf4, 0x90, 0x80, 0x80]).is_err()); + + // deny surrogates + assert!(from_utf8(&[0xED, 0xA0, 0x80]).is_err()); + assert!(from_utf8(&[0xED, 0xBF, 0xBF]).is_err()); + + assert!(from_utf8(&[0xC2, 0x80]).is_ok()); + assert!(from_utf8(&[0xDF, 0xBF]).is_ok()); + assert!(from_utf8(&[0xE0, 0xA0, 0x80]).is_ok()); + assert!(from_utf8(&[0xED, 0x9F, 0xBF]).is_ok()); + assert!(from_utf8(&[0xEE, 0x80, 0x80]).is_ok()); + assert!(from_utf8(&[0xEF, 0xBF, 0xBF]).is_ok()); + assert!(from_utf8(&[0xF0, 0x90, 0x80, 0x80]).is_ok()); + assert!(from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok()); +} + +#[test] +fn test_const_is_utf8() { + const _: () = { + // deny overlong encodings + assert!(from_utf8(&[0xc0, 0x80]).is_err()); + assert!(from_utf8(&[0xc0, 0xae]).is_err()); + assert!(from_utf8(&[0xe0, 0x80, 0x80]).is_err()); + assert!(from_utf8(&[0xe0, 0x80, 0xaf]).is_err()); + assert!(from_utf8(&[0xe0, 0x81, 0x81]).is_err()); + assert!(from_utf8(&[0xf0, 0x82, 0x82, 0xac]).is_err()); + 
assert!(from_utf8(&[0xf4, 0x90, 0x80, 0x80]).is_err()); + + // deny surrogates + assert!(from_utf8(&[0xED, 0xA0, 0x80]).is_err()); + assert!(from_utf8(&[0xED, 0xBF, 0xBF]).is_err()); + + assert!(from_utf8(&[0xC2, 0x80]).is_ok()); + assert!(from_utf8(&[0xDF, 0xBF]).is_ok()); + assert!(from_utf8(&[0xE0, 0xA0, 0x80]).is_ok()); + assert!(from_utf8(&[0xED, 0x9F, 0xBF]).is_ok()); + assert!(from_utf8(&[0xEE, 0x80, 0x80]).is_ok()); + assert!(from_utf8(&[0xEF, 0xBF, 0xBF]).is_ok()); + assert!(from_utf8(&[0xF0, 0x90, 0x80, 0x80]).is_ok()); + assert!(from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok()); + }; +} + +#[test] +fn from_utf8_mostly_ascii() { + // deny invalid bytes embedded in long stretches of ascii + for i in 32..64 { + let mut data = [0; 128]; + data[i] = 0xC0; + assert!(from_utf8(&data).is_err()); + data[i] = 0xC2; + assert!(from_utf8(&data).is_err()); + } +} + +#[test] +fn const_from_utf8_mostly_ascii() { + const _: () = { + // deny invalid bytes embedded in long stretches of ascii + let mut i = 32; + while i < 64 { + let mut data = [0; 128]; + data[i] = 0xC0; + assert!(from_utf8(&data).is_err()); + data[i] = 0xC2; + assert!(from_utf8(&data).is_err()); + + i = i + 1; + } + }; +} + +#[test] +fn from_utf8_error() { + macro_rules! 
test { + ($input: expr, $expected_valid_up_to:pat, $expected_error_len:pat) => { + let error = from_utf8($input).unwrap_err(); + assert_matches!(error.valid_up_to(), $expected_valid_up_to); + assert_matches!(error.error_len(), $expected_error_len); + + const _: () = { + match from_utf8($input) { + Err(error) => { + let valid_up_to = error.valid_up_to(); + let error_len = error.error_len(); + + assert!(matches!(valid_up_to, $expected_valid_up_to)); + assert!(matches!(error_len, $expected_error_len)); + } + Ok(_) => unreachable!(), + } + }; + }; + } + test!(b"A\xC3\xA9 \xFF ", 4, Some(1)); + test!(b"A\xC3\xA9 \x80 ", 4, Some(1)); + test!(b"A\xC3\xA9 \xC1 ", 4, Some(1)); + test!(b"A\xC3\xA9 \xC1", 4, Some(1)); + test!(b"A\xC3\xA9 \xC2", 4, None); + test!(b"A\xC3\xA9 \xC2 ", 4, Some(1)); + test!(b"A\xC3\xA9 \xC2\xC0", 4, Some(1)); + test!(b"A\xC3\xA9 \xE0", 4, None); + test!(b"A\xC3\xA9 \xE0\x9F", 4, Some(1)); + test!(b"A\xC3\xA9 \xE0\xA0", 4, None); + test!(b"A\xC3\xA9 \xE0\xA0\xC0", 4, Some(2)); + test!(b"A\xC3\xA9 \xE0\xA0 ", 4, Some(2)); + test!(b"A\xC3\xA9 \xED\xA0\x80 ", 4, Some(1)); + test!(b"A\xC3\xA9 \xF1", 4, None); + test!(b"A\xC3\xA9 \xF1\x80", 4, None); + test!(b"A\xC3\xA9 \xF1\x80\x80", 4, None); + test!(b"A\xC3\xA9 \xF1 ", 4, Some(1)); + test!(b"A\xC3\xA9 \xF1\x80 ", 4, Some(2)); + test!(b"A\xC3\xA9 \xF1\x80\x80 ", 4, Some(3)); +} + +#[test] +fn test_as_bytes() { + // no null + let v = [ + 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228, 184, 173, 229, 141, 142, + 86, 105, 225, 187, 135, 116, 32, 78, 97, 109, + ]; + let b: &[u8] = &[]; + assert_eq!("".as_bytes(), b); + assert_eq!("abc".as_bytes(), b"abc"); + assert_eq!("ศไทย中华Việt Nam".as_bytes(), v); +} + +#[test] +#[should_panic] +fn test_as_bytes_fail() { + // Don't double free. (I'm not sure if this exercises the + // original problem code path anymore.) 
+ let s = String::from(""); + let _bytes = s.as_bytes(); + panic!(); +} + +#[test] +fn test_as_ptr() { + let buf = "hello".as_ptr(); + unsafe { + assert_eq!(*buf.add(0), b'h'); + assert_eq!(*buf.add(1), b'e'); + assert_eq!(*buf.add(2), b'l'); + assert_eq!(*buf.add(3), b'l'); + assert_eq!(*buf.add(4), b'o'); + } +} + +#[test] +fn vec_str_conversions() { + let s1: String = String::from("All mimsy were the borogoves"); + + let v: Vec = s1.as_bytes().to_vec(); + let s2: String = String::from(from_utf8(&v).unwrap()); + let mut i = 0; + let n1 = s1.len(); + let n2 = v.len(); + assert_eq!(n1, n2); + while i < n1 { + let a: u8 = s1.as_bytes()[i]; + let b: u8 = s2.as_bytes()[i]; + assert_eq!(a, b); + i += 1; + } +} + */ + +// String needles passed to the file-local `contains` helper: an infix, a prefix, a suffix, +// the empty needle (also against an empty haystack), absent needles, and needles inside a +// haystack made of multi-byte characters. +#[test] +fn test_contains() { + assert!(contains("abcde", "bcd")); + assert!(contains("abcde", "abcd")); + assert!(contains("abcde", "bcde")); + assert!(contains("abcde", "")); + assert!(contains("", "")); + assert!(!contains("abcde", "def")); + assert!(!contains("", "a")); + + let data = "ประเทศไทย中华Việt Nam"; + assert!(contains(data, "ประเ")); + assert!(contains(data, "ะเ")); + assert!(contains(data, "中华")); + assert!(!contains(data, "ไท华")); +} + +// Same helper with `char` needles: a character in the middle, the only character of the +// haystack, an absent character, and an empty haystack. +#[test] +fn test_contains_char() { + assert!(contains("abc", 'b')); + assert!(contains("a", 'a')); + assert!(!contains("abc", 'd')); + assert!(!contains("", 'a')); +} + +/* +#[test] +fn test_split_at() { + let s = "ศไทย中华Việt Nam"; + for (index, _) in s.char_indices() { + let (a, b) = s.split_at(index); + assert_eq!(&s[..a.len()], a); + assert_eq!(&s[a.len()..], b); + } + let (a, b) = s.split_at(s.len()); + assert_eq!(a, s); + assert_eq!(b, ""); +} + +#[test] +fn test_split_at_mut() { + let mut s = "Hello World".to_string(); + { + let (a, b) = s.split_at_mut(5); + a.make_ascii_uppercase(); + b.make_ascii_lowercase(); + } + assert_eq!(s, "HELLO world"); +} + +#[test] +#[should_panic] +fn test_split_at_boundscheck() { + let s = "ศไทย中华Việt Nam"; + let _ = s.split_at(1); +} + +#[test] +fn test_escape_unicode() { 
+ assert_eq!("abc".escape_unicode().to_string(), "\\u{61}\\u{62}\\u{63}"); + assert_eq!("a c".escape_unicode().to_string(), "\\u{61}\\u{20}\\u{63}"); + assert_eq!("\r\n\t".escape_unicode().to_string(), "\\u{d}\\u{a}\\u{9}"); + assert_eq!("'\"\\".escape_unicode().to_string(), "\\u{27}\\u{22}\\u{5c}"); + assert_eq!("\x00\x01\u{fe}\u{ff}".escape_unicode().to_string(), "\\u{0}\\u{1}\\u{fe}\\u{ff}"); + assert_eq!("\u{100}\u{ffff}".escape_unicode().to_string(), "\\u{100}\\u{ffff}"); + assert_eq!("\u{10000}\u{10ffff}".escape_unicode().to_string(), "\\u{10000}\\u{10ffff}"); + assert_eq!("ab\u{fb00}".escape_unicode().to_string(), "\\u{61}\\u{62}\\u{fb00}"); + assert_eq!("\u{1d4ea}\r".escape_unicode().to_string(), "\\u{1d4ea}\\u{d}"); +} + +#[test] +fn test_escape_debug() { + // Note that there are subtleties with the number of backslashes + // on the left- and right-hand sides. In particular, Unicode code points + // are usually escaped with two backslashes on the right-hand side, as + // they are escaped. However, when the character is unescaped (e.g., for + // printable characters), only a single backslash appears (as the character + // itself appears in the debug string). 
+ assert_eq!("abc".escape_debug().to_string(), "abc"); + assert_eq!("a c".escape_debug().to_string(), "a c"); + assert_eq!("éèê".escape_debug().to_string(), "éèê"); + assert_eq!("\0\r\n\t".escape_debug().to_string(), "\\0\\r\\n\\t"); + assert_eq!("'\"\\".escape_debug().to_string(), "\\'\\\"\\\\"); + assert_eq!("\u{7f}\u{ff}".escape_debug().to_string(), "\\u{7f}\u{ff}"); + assert_eq!("\u{100}\u{ffff}".escape_debug().to_string(), "\u{100}\\u{ffff}"); + assert_eq!("\u{10000}\u{10ffff}".escape_debug().to_string(), "\u{10000}\\u{10ffff}"); + assert_eq!("ab\u{200b}".escape_debug().to_string(), "ab\\u{200b}"); + assert_eq!("\u{10d4ea}\r".escape_debug().to_string(), "\\u{10d4ea}\\r"); + assert_eq!( + "\u{301}a\u{301}bé\u{e000}".escape_debug().to_string(), + "\\u{301}a\u{301}bé\\u{e000}" + ); +} + +#[test] +fn test_escape_default() { + assert_eq!("abc".escape_default().to_string(), "abc"); + assert_eq!("a c".escape_default().to_string(), "a c"); + assert_eq!("éèê".escape_default().to_string(), "\\u{e9}\\u{e8}\\u{ea}"); + assert_eq!("\r\n\t".escape_default().to_string(), "\\r\\n\\t"); + assert_eq!("'\"\\".escape_default().to_string(), "\\'\\\"\\\\"); + assert_eq!("\u{7f}\u{ff}".escape_default().to_string(), "\\u{7f}\\u{ff}"); + assert_eq!("\u{100}\u{ffff}".escape_default().to_string(), "\\u{100}\\u{ffff}"); + assert_eq!("\u{10000}\u{10ffff}".escape_default().to_string(), "\\u{10000}\\u{10ffff}"); + assert_eq!("ab\u{200b}".escape_default().to_string(), "ab\\u{200b}"); + assert_eq!("\u{10d4ea}\r".escape_default().to_string(), "\\u{10d4ea}\\r"); +} + */ + +// Comparing two strings as `&str` must give the same `Ordering` as comparing them after +// passing each through the file-local `os` helper. Cases: one input a strict prefix of the +// other (both directions), equal inputs, and inputs that differ before either one ends. +#[test] +fn test_total_ord() { + fn test(lhs: &str, rhs: &str) { + assert_eq!(lhs.cmp(rhs), os(lhs).cmp(os(rhs)), "{lhs} <=> {rhs}"); + } + + test("1234", "123"); + test("123", "1234"); + test("1234", "1234"); + test("12345555", "123456"); + test("22", "1234"); +} + +/* +#[test] +fn test_iterator() { + let s = "ศไทย中华Việt Nam"; + let v = ['ศ', 'ไ', 'ท', 'ย', '中', '华', 'V', 'i', 'ệ', 't', ' ', 'N', 'a', 'm']; + + let mut pos = 
0; + let it = s.chars(); + + for c in it { + assert_eq!(c, v[pos]); + pos += 1; + } + assert_eq!(pos, v.len()); + assert_eq!(s.chars().count(), v.len()); +} + +#[test] +fn test_rev_iterator() { + let s = "ศไทย中华Việt Nam"; + let v = ['m', 'a', 'N', ' ', 't', 'ệ', 'i', 'V', '华', '中', 'ย', 'ท', 'ไ', 'ศ']; + + let mut pos = 0; + let it = s.chars().rev(); + + for c in it { + assert_eq!(c, v[pos]); + pos += 1; + } + assert_eq!(pos, v.len()); +} + +#[test] +fn test_to_lowercase_rev_iterator() { + let s = "AÖßÜ💩ΣΤΙΓΜΑΣDžfiİ"; + let v = ['\u{307}', 'i', 'fi', 'dž', 'σ', 'α', 'μ', 'γ', 'ι', 'τ', 'σ', '💩', 'ü', 'ß', 'ö', 'a']; + + let mut pos = 0; + let it = s.chars().flat_map(|c| c.to_lowercase()).rev(); + + for c in it { + assert_eq!(c, v[pos]); + pos += 1; + } + assert_eq!(pos, v.len()); +} + +#[test] +fn test_to_uppercase_rev_iterator() { + let s = "aößü💩στιγμαςDžfiᾀ"; + let v = + ['Ι', 'Ἀ', 'I', 'F', 'DŽ', 'Σ', 'Α', 'Μ', 'Γ', 'Ι', 'Τ', 'Σ', '💩', 'Ü', 'S', 'S', 'Ö', 'A']; + + let mut pos = 0; + let it = s.chars().flat_map(|c| c.to_uppercase()).rev(); + + for c in it { + assert_eq!(c, v[pos]); + pos += 1; + } + assert_eq!(pos, v.len()); +} + +#[test] +#[cfg_attr(miri, ignore)] // Miri is too slow +fn test_chars_decoding() { + let mut bytes = [0; 4]; + for c in (0..0x110000).filter_map(std::char::from_u32) { + let s = c.encode_utf8(&mut bytes); + if Some(c) != s.chars().next() { + panic!("character {:x}={} does not decode correctly", c as u32, c); + } + } +} + +#[test] +#[cfg_attr(miri, ignore)] // Miri is too slow +fn test_chars_rev_decoding() { + let mut bytes = [0; 4]; + for c in (0..0x110000).filter_map(std::char::from_u32) { + let s = c.encode_utf8(&mut bytes); + if Some(c) != s.chars().rev().next() { + panic!("character {:x}={} does not decode correctly", c as u32, c); + } + } +} + +#[test] +fn test_iterator_clone() { + let s = "ศไทย中华Việt Nam"; + let mut it = s.chars(); + it.next(); + assert!(it.clone().zip(it).all(|(x, y)| x == y)); +} + +#[test] +fn 
test_iterator_last() { + let s = "ศไทย中华Việt Nam"; + let mut it = s.chars(); + it.next(); + assert_eq!(it.last(), Some('m')); +} + +#[test] +fn test_chars_debug() { + let s = "ศไทย中华Việt Nam"; + let c = s.chars(); + assert_eq!( + format!("{c:?}"), + r#"Chars(['ศ', 'ไ', 'ท', 'ย', '中', '华', 'V', 'i', 'ệ', 't', ' ', 'N', 'a', 'm'])"# + ); +} + +#[test] +fn test_bytesator() { + let s = "ศไทย中华Việt Nam"; + let v = [ + 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228, 184, 173, 229, 141, 142, + 86, 105, 225, 187, 135, 116, 32, 78, 97, 109, + ]; + let mut pos = 0; + + for b in s.bytes() { + assert_eq!(b, v[pos]); + pos += 1; + } +} + +#[test] +fn test_bytes_revator() { + let s = "ศไทย中华Việt Nam"; + let v = [ + 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228, 184, 173, 229, 141, 142, + 86, 105, 225, 187, 135, 116, 32, 78, 97, 109, + ]; + let mut pos = v.len(); + + for b in s.bytes().rev() { + pos -= 1; + assert_eq!(b, v[pos]); + } +} + +#[test] +fn test_bytesator_nth() { + let s = "ศไทย中华Việt Nam"; + let v = [ + 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228, 184, 173, 229, 141, 142, + 86, 105, 225, 187, 135, 116, 32, 78, 97, 109, + ]; + + let mut b = s.bytes(); + assert_eq!(b.nth(2).unwrap(), v[2]); + assert_eq!(b.nth(10).unwrap(), v[10]); + assert_eq!(b.nth(200), None); +} + +#[test] +fn test_bytesator_count() { + let s = "ศไทย中华Việt Nam"; + + let b = s.bytes(); + assert_eq!(b.count(), 28) +} + +#[test] +fn test_bytesator_last() { + let s = "ศไทย中华Việt Nam"; + + let b = s.bytes(); + assert_eq!(b.last().unwrap(), 109) +} + +#[test] +fn test_char_indicesator() { + let s = "ศไทย中华Việt Nam"; + let p = [0, 3, 6, 9, 12, 15, 18, 19, 20, 23, 24, 25, 26, 27]; + let v = ['ศ', 'ไ', 'ท', 'ย', '中', '华', 'V', 'i', 'ệ', 't', ' ', 'N', 'a', 'm']; + + let mut pos = 0; + let it = s.char_indices(); + + for c in it { + assert_eq!(c, (p[pos], v[pos])); + pos += 1; + } + assert_eq!(pos, v.len()); + assert_eq!(pos, p.len()); +} + +#[test] +fn 
test_char_indices_revator() { + let s = "ศไทย中华Việt Nam"; + let p = [27, 26, 25, 24, 23, 20, 19, 18, 15, 12, 9, 6, 3, 0]; + let v = ['m', 'a', 'N', ' ', 't', 'ệ', 'i', 'V', '华', '中', 'ย', 'ท', 'ไ', 'ศ']; + + let mut pos = 0; + let it = s.char_indices().rev(); + + for c in it { + assert_eq!(c, (p[pos], v[pos])); + pos += 1; + } + assert_eq!(pos, v.len()); + assert_eq!(pos, p.len()); +} + +#[test] +fn test_char_indices_last() { + let s = "ศไทย中华Việt Nam"; + let mut it = s.char_indices(); + it.next(); + assert_eq!(it.last(), Some((27, 'm'))); +} + +#[test] +fn test_splitn_char_iterator() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let split: Vec<&str> = data.splitn(4, ' ').collect(); + assert_eq!(split, ["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]); + + let split: Vec<&str> = data.splitn(4, |c: char| c == ' ').collect(); + assert_eq!(split, ["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]); + + // Unicode + let split: Vec<&str> = data.splitn(4, 'ä').collect(); + assert_eq!(split, ["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]); + + let split: Vec<&str> = data.splitn(4, |c: char| c == 'ä').collect(); + assert_eq!(split, ["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]); +} +*/ + +// Splitting on '\n' must keep the empty pieces before the leading and after the trailing +// separator. The haystack goes through `os(data)`, as in the other split tests in this file, +// so the `OsStr` split is what gets exercised rather than `str::split`. +#[test] +fn test_split_char_iterator_no_trailing() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let split: Vec<&OsStr> = os(data).split('\n').collect(); + assert_eq!(split, ["", "Märy häd ä little lämb", "Little lämb", ""]); + + /* + let split: Vec<&str> = data.split_terminator('\n').collect(); + assert_eq!(split, ["", "Märy häd ä little lämb", "Little lämb"]); + */ +} + +/* +#[test] +fn test_split_char_iterator_inclusive() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let split: Vec<&str> = data.split_inclusive('\n').collect(); + assert_eq!(split, ["\n", "Märy häd ä little lämb\n", "Little lämb\n"]); + + let uppercase_separated = "SheePSharKTurtlECaT"; + let mut first_char = true; + let split: Vec<&str> = uppercase_separated + 
.split_inclusive(|c: char| { + let split = !first_char && c.is_uppercase(); + first_char = split; + split + }) + .collect(); + assert_eq!(split, ["SheeP", "SharK", "TurtlE", "CaT"]); +} + +#[test] +fn test_split_char_iterator_inclusive_rev() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let split: Vec<&str> = data.split_inclusive('\n').rev().collect(); + assert_eq!(split, ["Little lämb\n", "Märy häd ä little lämb\n", "\n"]); + + // Note that the predicate is stateful and thus dependent + // on the iteration order. + // (A different predicate is needed for reverse iterator vs normal iterator.) + // Not sure if anything can be done though. + let uppercase_separated = "SheePSharKTurtlECaT"; + let mut term_char = true; + let split: Vec<&str> = uppercase_separated + .split_inclusive(|c: char| { + let split = term_char && c.is_uppercase(); + term_char = c.is_uppercase(); + split + }) + .rev() + .collect(); + assert_eq!(split, ["CaT", "TurtlE", "SharK", "SheeP"]); +} + +#[test] +fn test_rsplit() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let split: Vec<&str> = data.rsplit(' ').collect(); + assert_eq!(split, ["lämb\n", "lämb\nLittle", "little", "ä", "häd", "\nMäry"]); + + let split: Vec<&str> = data.rsplit("lämb").collect(); + assert_eq!(split, ["\n", "\nLittle ", "\nMäry häd ä little "]); + + let split: Vec<&str> = data.rsplit(|c: char| c == 'ä').collect(); + assert_eq!(split, ["mb\n", "mb\nLittle l", " little l", "d ", "ry h", "\nM"]); +} + +#[test] +fn test_rsplitn() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let split: Vec<&str> = data.rsplitn(2, ' ').collect(); + assert_eq!(split, ["lämb\n", "\nMäry häd ä little lämb\nLittle"]); + + let split: Vec<&str> = data.rsplitn(2, "lämb").collect(); + assert_eq!(split, ["\n", "\nMäry häd ä little lämb\nLittle "]); + + let split: Vec<&str> = data.rsplitn(2, |c: char| c == 'ä').collect(); + assert_eq!(split, ["mb\n", "\nMäry häd ä little lämb\nLittle l"]); +} + +#[test] +fn 
test_split_once() { + assert_eq!("".split_once("->"), None); + assert_eq!("-".split_once("->"), None); + assert_eq!("->".split_once("->"), Some(("", ""))); + assert_eq!("a->".split_once("->"), Some(("a", ""))); + assert_eq!("->b".split_once("->"), Some(("", "b"))); + assert_eq!("a->b".split_once("->"), Some(("a", "b"))); + assert_eq!("a->b->c".split_once("->"), Some(("a", "b->c"))); + assert_eq!("---".split_once("--"), Some(("", "-"))); +} + +#[test] +fn test_rsplit_once() { + assert_eq!("".rsplit_once("->"), None); + assert_eq!("-".rsplit_once("->"), None); + assert_eq!("->".rsplit_once("->"), Some(("", ""))); + assert_eq!("a->".rsplit_once("->"), Some(("a", ""))); + assert_eq!("->b".rsplit_once("->"), Some(("", "b"))); + assert_eq!("a->b".rsplit_once("->"), Some(("a", "b"))); + assert_eq!("a->b->c".rsplit_once("->"), Some(("a->b", "c"))); + assert_eq!("---".rsplit_once("--"), Some(("-", ""))); +} + +#[test] +fn test_split_whitespace() { + let data = "\n \tMäry häd\tä little lämb\nLittle lämb\n"; + let words: Vec<&str> = data.split_whitespace().collect(); + assert_eq!(words, ["Märy", "häd", "ä", "little", "lämb", "Little", "lämb"]) +} + +#[test] +fn test_lines() { + let data = "\nMäry häd ä little lämb\n\r\nLittle lämb\n"; + let lines: Vec<&str> = data.lines().collect(); + assert_eq!(lines, ["", "Märy häd ä little lämb", "", "Little lämb"]); + + let data = "\r\nMäry häd ä little lämb\n\nLittle lämb"; // no trailing \n + let lines: Vec<&str> = data.lines().collect(); + assert_eq!(lines, ["", "Märy häd ä little lämb", "", "Little lämb"]); +} + */ + +#[test] +fn test_splitator() { + fn t(s: &str, sep: &str, u: &[&str]) { + let want: Vec<&OsStr> = u.into_iter().map(|&v| os(v)).collect(); + let got: Vec<&OsStr> = os(s).split(sep).collect(); + assert_eq!(want, got); + } + t("--1233345--", "12345", &["--1233345--"]); + t("abc::hello::there", "::", &["abc", "hello", "there"]); + t("::hello::there", "::", &["", "hello", "there"]); + t("hello::there::", "::", &["hello", 
"there", ""]); + t("::hello::there::", "::", &["", "hello", "there", ""]); + t("ประเทศไทย中华Việt Nam", "中华", &["ประเทศไทย", "Việt Nam"]); + t("zzXXXzzYYYzz", "zz", &["", "XXX", "YYY", ""]); + t("zzXXXzYYYz", "XXX", &["zz", "zYYYz"]); + t(".XXX.YYY.", ".", &["", "XXX", "YYY", ""]); + t("", ".", &[""]); + t("zz", "zz", &["", ""]); + t("ok", "z", &["ok"]); + t("zzz", "zz", &["", "z"]); + t("zzzzz", "zz", &["", "", "z"]); +} + +#[test] +fn test_str_default() { + use std::default::Default; + + fn t>() { + let s: S = Default::default(); + assert_eq!(s.as_ref(), os("")); + } + + t::<&str>(); + t::(); + t::<&mut str>(); +} + +#[test] +fn test_str_container() { + fn sum_len(v: &[&str]) -> usize { + v.iter().map(|x| os(x).len()).sum() + } + + assert_eq!(5, sum_len(&["012", "", "34"])); + assert_eq!(5, sum_len(&["01", "2", "34", ""])); + assert_eq!(5, sum_len(&["01234"])); +} + +/* +#[test] +fn test_str_from_utf8() { + let xs = b"hello"; + assert_eq!(from_utf8(xs), Ok("hello")); + + let xs = "ศไทย中华Việt Nam".as_bytes(); + assert_eq!(from_utf8(xs), Ok("ศไทย中华Việt Nam")); + + let xs = b"hello\xFF"; + assert!(from_utf8(xs).is_err()); +} + */ + +#[test] +fn test_pattern_deref_forward() { + let data = "aabcdaa"; + assert!(data.contains("bcd")); + assert!(data.contains(&"bcd")); + assert!(data.contains(&"bcd".to_string())); +} + +#[test] +fn test_empty_match_indices() { + let data = "aä中!"; + let mut searcher = "".into_searcher(os(data)); + let got: Vec = core::iter::from_fn(|| searcher.next_match()) + .map(|(start, _)| start) + .collect(); + assert_eq!(got, [0, 1, 3, 6, 7]); +} + +// Checks the file-local `contains` helper against every prefix of `haystack`, the complete +// string included: each non-empty substring of a prefix must be found, and the same substring +// with its first or its last byte replaced by NUL must not be found. Slicing and replacement +// use byte offsets, so `haystack` is assumed to be ASCII and free of NUL bytes. +fn check_contains_all_substrings(haystack: &str) { + let mut modified_needle = String::new(); + + // `..=` so the complete haystack is checked too, not only its proper prefixes; needles + // that end at the last byte are otherwise never searched for in a haystack holding them. + for i in 0..=haystack.len() { + // check different haystack lengths since we special-case short haystacks. 
+ let haystack = &haystack[0..i]; + assert!(contains(haystack, "")); + for j in 0..haystack.len() { + for k in j + 1..=haystack.len() { + let needle = &haystack[j..k]; + assert!(contains(haystack, needle)); + modified_needle.clear(); + modified_needle.push_str(needle); + modified_needle.replace_range(0..1, "\0"); + assert!(!contains(haystack, &*modified_needle)); + + modified_needle.clear(); + modified_needle.push_str(needle); + modified_needle.replace_range(needle.len() - 1..needle.len(), "\0"); + assert!(!contains(haystack, &*modified_needle)); + } + } + } +} + +#[test] +#[cfg_attr(miri, ignore)] // Miri is too slow +fn strslice_issue_16589() { + assert!(contains("bananas", "nana")); + + // prior to the fix for #16589, x.contains("abcdabcd") returned false + // test all substrings for good measure + check_contains_all_substrings("012345678901234567890123456789bcdabcdabcd"); +} + +#[test] +fn strslice_issue_16878() { + assert!(!contains("1234567ah012345678901ah", "hah")); + assert!(!contains("00abc01234567890123456789abc", "bcabc")); +} + +#[test] +fn strslice_issue_104726() { + // Edge-case in the simd_contains impl. + // The first and last byte are the same so it backtracks by one byte + // which aligns with the end of the string. Previously incorrect offset calculations + // led to out-of-bounds slicing. 
+ #[rustfmt::skip] + let needle = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaba"; + let haystack = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab"; + assert!(!contains(haystack, needle)); +} + +#[test] +#[cfg_attr(miri, ignore)] // Miri is too slow +fn test_strslice_contains() { + let x = "There are moments, Jeeves, when one asks oneself, 'Do trousers matter?'"; + check_contains_all_substrings(x); +} + +/* +#[test] +fn test_rsplitn_char_iterator() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let mut split: Vec<&str> = data.rsplitn(4, ' ').collect(); + split.reverse(); + assert_eq!(split, ["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]); + + let mut split: Vec<&str> = data.rsplitn(4, |c: char| c == ' ').collect(); + split.reverse(); + assert_eq!(split, ["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]); + + // Unicode + let mut split: Vec<&str> = data.rsplitn(4, 'ä').collect(); + split.reverse(); + assert_eq!(split, ["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]); + + let mut split: Vec<&str> = data.rsplitn(4, |c: char| c == 'ä').collect(); + split.reverse(); + assert_eq!(split, ["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]); +} +*/ + +#[test] +fn test_split_char_iterator() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let split: Vec<&OsStr> = os(data).split(' ').collect(); + assert_eq!(split, ["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]); + + let mut rsplit: Vec<&OsStr> = os(data).split(' ').rev().collect(); + rsplit.reverse(); + assert_eq!(rsplit, ["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]); + + let split: Vec<&OsStr> = os(data).split(predicate(|c: char| c == ' ')).collect(); + assert_eq!(split, ["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]); + + let mut rsplit: Vec<&OsStr> = os(data).split(predicate(|c: char| c == ' ')).rev().collect(); + rsplit.reverse(); + assert_eq!(rsplit, ["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]); + + // Unicode + let split: 
Vec<&OsStr> = os(data).split('ä').collect(); + assert_eq!(split, ["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]); + + let mut rsplit: Vec<&OsStr> = os(data).split('ä').rev().collect(); + rsplit.reverse(); + assert_eq!(rsplit, ["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]); + + let split: Vec<&OsStr> = os(data).split(predicate(|c: char| c == 'ä')).collect(); + assert_eq!(split, ["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]); + + let mut rsplit: Vec<&OsStr> = os(data).split(predicate(|c: char| c == 'ä')).rev().collect(); + rsplit.reverse(); + assert_eq!(rsplit, ["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]); +} + +#[test] +fn test_rev_split_char_iterator_no_trailing() { + let data = "\nMäry häd ä little lämb\nLittle lämb\n"; + + let mut split: Vec<&OsStr> = os(data).split('\n').rev().collect(); + split.reverse(); + assert_eq!(split, ["", "Märy häd ä little lämb", "Little lämb", ""]); +/* + let mut split: Vec<&OsStr> = os(data).split_terminator('\n').rev().collect(); + split.reverse(); + assert_eq!(split, ["", "Märy häd ä little lämb", "Little lämb"]); +*/ +} + +/* +#[test] +fn test_utf16_code_units() { + assert_eq!("é\u{1F4A9}".encode_utf16().collect::>(), [0xE9, 0xD83D, 0xDCA9]) +} + */ + +#[test] +fn starts_with_in_unicode() { + assert!(!os("├── Cargo.toml").starts_with("# ")); +} + +#[test] +fn starts_short_long() { + assert!(!os("").starts_with("##")); + assert!(!os("##").starts_with("####")); + assert!(os("####").starts_with("##")); + assert!(!os("##ä").starts_with("####")); + assert!(os("####ä").starts_with("##")); + assert!(!os("##").starts_with("####ä")); + assert!(os("##ä##").starts_with("##ä")); + + assert!(os("").starts_with("")); + assert!(os("ä").starts_with("")); + assert!(os("#ä").starts_with("")); + assert!(os("##ä").starts_with("")); + assert!(os("ä###").starts_with("")); + assert!(os("#ä##").starts_with("")); + assert!(os("##ä#").starts_with("")); +} + +#[test] +fn contains_weird_cases() { + 
assert!(contains("* \t", ' ')); + assert!(!contains("* \t", '?')); + assert!(!contains("* \t", '\u{1F4A9}')); +} + +/* +#[test] +fn trim_ws() { + assert_eq!(" \t a \t ".trim_start_matches(|c: char| c.is_whitespace()), "a \t "); + assert_eq!(" \t a \t ".trim_end_matches(|c: char| c.is_whitespace()), " \t a"); + assert_eq!(" \t a \t ".trim_start_matches(|c: char| c.is_whitespace()), "a \t "); + assert_eq!(" \t a \t ".trim_end_matches(|c: char| c.is_whitespace()), " \t a"); + assert_eq!(" \t a \t ".trim_matches(|c: char| c.is_whitespace()), "a"); + assert_eq!(" \t \t ".trim_start_matches(|c: char| c.is_whitespace()), ""); + assert_eq!(" \t \t ".trim_end_matches(|c: char| c.is_whitespace()), ""); + assert_eq!(" \t \t ".trim_start_matches(|c: char| c.is_whitespace()), ""); + assert_eq!(" \t \t ".trim_end_matches(|c: char| c.is_whitespace()), ""); + assert_eq!(" \t \t ".trim_matches(|c: char| c.is_whitespace()), ""); +} + +#[test] +fn to_lowercase() { + assert_eq!("".to_lowercase(), ""); + assert_eq!("AÉDžaé ".to_lowercase(), "aédžaé "); + + // https://github.com/rust-lang/rust/issues/26035 + assert_eq!("ΑΣ".to_lowercase(), "ας"); + assert_eq!("Α'Σ".to_lowercase(), "α'ς"); + assert_eq!("Α''Σ".to_lowercase(), "α''ς"); + + assert_eq!("ΑΣ Α".to_lowercase(), "ας α"); + assert_eq!("Α'Σ Α".to_lowercase(), "α'ς α"); + assert_eq!("Α''Σ Α".to_lowercase(), "α''ς α"); + + assert_eq!("ΑΣ' Α".to_lowercase(), "ας' α"); + assert_eq!("ΑΣ'' Α".to_lowercase(), "ας'' α"); + + assert_eq!("Α'Σ' Α".to_lowercase(), "α'ς' α"); + assert_eq!("Α''Σ'' Α".to_lowercase(), "α''ς'' α"); + + assert_eq!("Α Σ".to_lowercase(), "α σ"); + assert_eq!("Α 'Σ".to_lowercase(), "α 'σ"); + assert_eq!("Α ''Σ".to_lowercase(), "α ''σ"); + + assert_eq!("Σ".to_lowercase(), "σ"); + assert_eq!("'Σ".to_lowercase(), "'σ"); + assert_eq!("''Σ".to_lowercase(), "''σ"); + + assert_eq!("ΑΣΑ".to_lowercase(), "ασα"); + assert_eq!("ΑΣ'Α".to_lowercase(), "ασ'α"); + assert_eq!("ΑΣ''Α".to_lowercase(), "ασ''α"); + + // a really long 
string that has its lowercase form + // even longer. this tests that implementations don't assume + // an incorrect upper bound on allocations + let upper = str::repeat("İ", 512); + let lower = str::repeat("i̇", 512); + assert_eq!(upper.to_lowercase(), lower); + + // a really long ascii-only string. + // This tests that the ascii hot-path + // functions correctly + let upper = str::repeat("A", 511); + let lower = str::repeat("a", 511); + assert_eq!(upper.to_lowercase(), lower); +} + +#[test] +fn to_uppercase() { + assert_eq!("".to_uppercase(), ""); + assert_eq!("aéDžßfiᾀ".to_uppercase(), "AÉDŽSSFIἈΙ"); +} +*/ + +#[test] +fn test_into_string() { + // The only way to acquire a Box<OsStr> in the first place is through + // an OsString, so just test that we can round-trip between Box<OsStr> and + // OsString. + let string = OsString::from("Some text goes here"); + assert_eq!(string.clone().into_boxed_os_str().into_os_string(), string); +} + +#[test] +fn test_box_slice_clone() { + // Cloning the boxed value must preserve its contents, multi-byte characters included. + let data = OsString::from("hello HELLO hello HELLO yes YES 5 中ä华!!!"); + let data2 = data.clone().into_boxed_os_str().clone().into_os_string(); + + assert_eq!(data, data2); +} + +#[test] +fn test_cow_from() { + // `Cow::from` must produce `Owned` from an `OsString` and `Borrowed` from the value + // returned by the file-local `os` helper, each comparing equal to its source. + let borrowed = os("borrowed"); + let owned = OsString::from("owned"); + match (Cow::from(owned.clone()), Cow::from(borrowed)) { + (Cow::Owned(o), Cow::Borrowed(b)) => assert!(o == owned && b == borrowed), + _ => panic!("invalid `Cow::from`"), + } +} + +/* +#[test] +fn test_repeat() { + assert_eq!("".repeat(3), ""); + assert_eq!("abc".repeat(0), ""); + assert_eq!("α".repeat(3), "ααα"); +} +*/ + +mod pattern { + use core::pattern::SearchStep::{self, Done, Match, Reject}; + use core::pattern::{Pattern, ReverseSearcher, Searcher}; + use super::*; + + macro_rules! 
make_test { + ($name:ident, $p:expr, $h:expr, [$($e:expr,)*]) => { + #[allow(unused_imports)] + mod $name { + use core::pattern::SearchStep::{Match, Reject}; + use super::{cmp_search_to_vec}; + #[test] + fn fwd() { + cmp_search_to_vec(false, $p, $h, vec![$($e),*]); + } + #[test] + fn bwd() { + cmp_search_to_vec(true, $p, $h, vec![$($e),*]); + } + } + } + } + + fn cmp_search_to_vec<'a>( + rev: bool, + pat: impl Pattern<&'a OsStr, Searcher: ReverseSearcher<&'a OsStr>>, + haystack: &'a str, + right: Vec, + ) { + let mut searcher = pat.into_searcher(os(haystack)); + let mut v = vec![]; + loop { + match if !rev { searcher.next() } else { searcher.next_back() } { + Match(a, b) => v.push(Match(a, b)), + Reject(a, b) => v.push(Reject(a, b)), + Done => break, + } + } + if rev { + v.reverse(); + } + + let mut first_index = 0; + let mut err = None; + + for (i, e) in right.iter().enumerate() { + match *e { + Match(a, b) | Reject(a, b) if a <= b && a == first_index => { + first_index = b; + } + _ => { + err = Some(i); + break; + } + } + } + + if let Some(err) = err { + panic!("Input skipped range at {err}"); + } + + if first_index != haystack.len() { + panic!("Did not cover whole input"); + } + + assert_eq!(v, right); + } + + make_test!( + str_searcher_ascii_haystack, + "bb", + "abbcbbd", + [Reject(0, 1), Match(1, 3), Reject(3, 4), Match(4, 6), Reject(6, 7),] + ); + make_test!( + str_searcher_ascii_haystack_seq, + "bb", + "abbcbbbbd", + [Reject(0, 1), Match(1, 3), Reject(3, 4), Match(4, 6), Match(6, 8), Reject(8, 9),] + ); + make_test!( + str_searcher_empty_needle_ascii_haystack, + "", + "abbcbbd", + [ + Match(0, 0), + Reject(0, 1), + Match(1, 1), + Reject(1, 2), + Match(2, 2), + Reject(2, 3), + Match(3, 3), + Reject(3, 4), + Match(4, 4), + Reject(4, 5), + Match(5, 5), + Reject(5, 6), + Match(6, 6), + Reject(6, 7), + Match(7, 7), + ] + ); + make_test!( + str_searcher_multibyte_haystack, + " ", + "├──", + [Reject(0, 9),] + ); + make_test!( + 
str_searcher_empty_needle_multibyte_haystack, + "", + "├──", + [ + Match(0, 0), + Reject(0, 3), + Match(3, 3), + Reject(3, 6), + Match(6, 6), + Reject(6, 9), + Match(9, 9), + ] + ); + make_test!(str_searcher_empty_needle_empty_haystack, "", "", [Match(0, 0),]); + make_test!(str_searcher_nonempty_needle_empty_haystack, "├", "", []); + make_test!( + char_searcher_ascii_haystack, + 'b', + "abbcbbd", + [ + Reject(0, 1), + Match(1, 2), + Match(2, 3), + Reject(3, 4), + Match(4, 5), + Match(5, 6), + Reject(6, 7), + ] + ); + make_test!( + char_searcher_multibyte_haystack, + ' ', + "├──", + [Reject(0, 9),] + ); + make_test!( + char_searcher_short_haystack, + '\u{1F4A9}', + "* \t", + [Reject(0, 3),] + ); + + // See #85462 + #[test] + fn str_searcher_empty_needle_after_done() { + // Empty needle and haystack + { + let mut searcher = "".into_searcher(os("")); + + assert_eq!(searcher.next(), SearchStep::Match(0, 0)); + assert_eq!(searcher.next(), SearchStep::Done); + assert_eq!(searcher.next(), SearchStep::Done); + assert_eq!(searcher.next(), SearchStep::Done); + + let mut searcher = "".into_searcher(os("")); + + assert_eq!(searcher.next_back(), SearchStep::Match(0, 0)); + assert_eq!(searcher.next_back(), SearchStep::Done); + assert_eq!(searcher.next_back(), SearchStep::Done); + assert_eq!(searcher.next_back(), SearchStep::Done); + } + // Empty needle and non-empty haystack + { + let mut searcher = "".into_searcher(os("a")); + + assert_eq!(searcher.next(), SearchStep::Match(0, 0)); + assert_eq!(searcher.next(), SearchStep::Reject(0, 1)); + assert_eq!(searcher.next(), SearchStep::Match(1, 1)); + assert_eq!(searcher.next(), SearchStep::Done); + assert_eq!(searcher.next(), SearchStep::Done); + assert_eq!(searcher.next(), SearchStep::Done); + + let mut searcher = "".into_searcher(os("a")); + + assert_eq!(searcher.next_back(), SearchStep::Match(1, 1)); + assert_eq!(searcher.next_back(), SearchStep::Reject(0, 1)); + assert_eq!(searcher.next_back(), SearchStep::Match(0, 0)); + 
assert_eq!(searcher.next_back(), SearchStep::Done); + assert_eq!(searcher.next_back(), SearchStep::Done); + assert_eq!(searcher.next_back(), SearchStep::Done); + } + } +} + +macro_rules! generate_iterator_test { + { + $name:ident { + $( + ($($arg:expr),*) -> [$($t:tt)*]; + )* + } + with $fwd:expr, $bwd:expr; + } => { + #[test] + fn $name() { + $( + { + let res = vec![$($t)*]; + + let fwd_vec: Vec<_> = ($fwd)($($arg),*).collect(); + assert_eq!(fwd_vec, res); + + let mut bwd_vec: Vec<_> = ($bwd)($($arg),*).collect(); + bwd_vec.reverse(); + assert_eq!(bwd_vec, res); + } + )* + } + }; + { + $name:ident { + $( + ($($arg:expr),*) -> [$($t:tt)*]; + )* + } + with $fwd:expr; + } => { + #[test] + fn $name() { + $( + { + let want: Vec<_> = [$($t)*].into_iter().map(os).collect(); + + let fwd_vec: Vec<_> = ($fwd)($($arg),*).collect(); + assert_eq!(fwd_vec, want); + } + )* + } + } +} + +generate_iterator_test! { + double_ended_split { + (os("foo.bar.baz"), '.') -> ["foo", "bar", "baz"]; + (os("foo::bar::baz"), "::") -> ["foo", "bar", "baz"]; + } + with OsStr::split /*, str::rsplit */; +} + +/* +generate_iterator_test! { + double_ended_split_terminator { + ("foo;bar;baz;", ';') -> ["foo", "bar", "baz"]; + } + with str::split_terminator, str::rsplit_terminator; +} + +generate_iterator_test! { + double_ended_matches { + ("a1b2c3", char::is_numeric) -> ["1", "2", "3"]; + } + with str::matches, str::rmatches; +} + +generate_iterator_test! { + double_ended_match_indices { + ("a1b2c3", char::is_numeric) -> [(1, "1"), (3, "2"), (5, "3")]; + } + with str::match_indices, str::rmatch_indices; +} + +generate_iterator_test! { + not_double_ended_splitn { + ("foo::bar::baz", 2, "::") -> ["foo", "bar::baz"]; + } + with str::splitn; +} + +generate_iterator_test! 
{ + not_double_ended_rsplitn { + ("foo::bar::baz", 2, "::") -> ["baz", "foo::bar"]; + } + with str::rsplitn; +} +*/ + +/* +#[test] +fn different_str_pattern_forwarding_lifetimes() { + use core::pattern::Pattern; + + fn foo<'a, P>(p: P) + where + for<'b> &'b P: Pattern<&'a OsStr>, + { + for _ in 0..3 { + os("asdf").find(&p); + } + } + + foo::<&str>("x"); +} +*/ + +/* +#[test] +fn test_str_multiline() { + let a: String = "this \ +is a test" + .to_string(); + let b: String = "this \ + is \ + another \ + test" + .to_string(); + assert_eq!(a, "this is a test".to_string()); + assert_eq!(b, "this is another test".to_string()); +} + +#[test] +fn test_str_escapes() { + let x = "\\\\\ + "; + assert_eq!(x, r"\\"); // extraneous whitespace stripped +} + +#[test] +fn const_str_ptr() { + const A: [u8; 2] = ['h' as u8, 'i' as u8]; + const B: &'static [u8; 2] = &A; + const C: *const u8 = B as *const u8; + + // Miri does not deduplicate consts (https://github.com/rust-lang/miri/issues/131) + #[cfg(not(miri))] + { + let foo = &A as *const u8; + assert_eq!(foo, C); + } + + unsafe { + assert_eq!(from_utf8_unchecked(&A), "hi"); + assert_eq!(*C, A[0]); + assert_eq!(*(&B[0] as *const u8), A[0]); + } +} + +#[test] +fn utf8() { + let yen: char = '¥'; // 0xa5 + let c_cedilla: char = 'ç'; // 0xe7 + let thorn: char = 'þ'; // 0xfe + let y_diaeresis: char = 'ÿ'; // 0xff + let pi: char = 'Π'; // 0x3a0 + + assert_eq!(yen as isize, 0xa5); + assert_eq!(c_cedilla as isize, 0xe7); + assert_eq!(thorn as isize, 0xfe); + assert_eq!(y_diaeresis as isize, 0xff); + assert_eq!(pi as isize, 0x3a0); + + assert_eq!(pi as isize, '\u{3a0}' as isize); + assert_eq!('\x0a' as isize, '\n' as isize); + + let bhutan: String = "འབྲུག་ཡུལ།".to_string(); + let japan: String = "日本".to_string(); + let uzbekistan: String = "Ўзбекистон".to_string(); + let austria: String = "Österreich".to_string(); + + let bhutan_e: String = + "\u{f60}\u{f56}\u{fb2}\u{f74}\u{f42}\u{f0b}\u{f61}\u{f74}\u{f63}\u{f0d}".to_string(); + let 
japan_e: String = "\u{65e5}\u{672c}".to_string(); + let uzbekistan_e: String = + "\u{40e}\u{437}\u{431}\u{435}\u{43a}\u{438}\u{441}\u{442}\u{43e}\u{43d}".to_string(); + let austria_e: String = "\u{d6}sterreich".to_string(); + + let oo: char = 'Ö'; + assert_eq!(oo as isize, 0xd6); + + fn check_str_eq(a: String, b: String) { + let mut i: isize = 0; + for ab in a.bytes() { + println!("{i}"); + println!("{ab}"); + let bb: u8 = b.as_bytes()[i as usize]; + println!("{bb}"); + assert_eq!(ab, bb); + i += 1; + } + } + + check_str_eq(bhutan, bhutan_e); + check_str_eq(japan, japan_e); + check_str_eq(uzbekistan, uzbekistan_e); + check_str_eq(austria, austria_e); +} + +#[test] +fn utf8_chars() { + // Chars of 1, 2, 3, and 4 bytes + let chs: Vec<char> = vec!['e', 'é', '€', '\u{10000}']; + let s: String = chs.iter().cloned().collect(); + let schs: Vec<char> = s.chars().collect(); + + assert_eq!(s.len(), 10); + assert_eq!(s.chars().count(), 4); + assert_eq!(schs.len(), 4); + assert_eq!(schs.iter().cloned().collect::<String>(), s); + + assert!((from_utf8(s.as_bytes()).is_ok())); + // invalid prefix + assert!((!from_utf8(&[0x80]).is_ok())); + // invalid 2 byte prefix + assert!((!from_utf8(&[0xc0]).is_ok())); + assert!((!from_utf8(&[0xc0, 0x10]).is_ok())); + // invalid 3 byte prefix + assert!((!from_utf8(&[0xe0]).is_ok())); + assert!((!from_utf8(&[0xe0, 0x10]).is_ok())); + assert!((!from_utf8(&[0xe0, 0xff, 0x10]).is_ok())); + // invalid 4 byte prefix + assert!((!from_utf8(&[0xf0]).is_ok())); + assert!((!from_utf8(&[0xf0, 0x10]).is_ok())); + assert!((!from_utf8(&[0xf0, 0xff, 0x10]).is_ok())); + assert!((!from_utf8(&[0xf0, 0xff, 0xff, 0x10]).is_ok())); +} + +#[test] +fn utf8_char_counts() { + let strs = [("e", 1), ("é", 1), ("€", 1), ("\u{10000}", 1), ("eé€\u{10000}", 4)]; + let spread = if cfg!(miri) { 4 } else { 8 }; + let mut reps = [8, 64, 256, 512] + .iter() + .copied() + .flat_map(|n| n - spread..=n + spread) + .collect::<Vec<usize>>(); + if cfg!(not(miri)) { + reps.extend([1024, 1 << 
16].iter().copied().flat_map(|n| n - spread..=n + spread)); + } + let counts = if cfg!(miri) { 0..1 } else { 0..8 }; + let padding = counts.map(|len| " ".repeat(len)).collect::<Vec<String>>(); + + for repeat in reps { + for (tmpl_str, tmpl_char_count) in strs { + for pad_start in &padding { + for pad_end in &padding { + // Create a string with padding... + let with_padding = + format!("{}{}{}", pad_start, tmpl_str.repeat(repeat), pad_end); + // ...and then skip past that padding. This should ensure + // that we test several different alignments for both head + // and tail. + let si = pad_start.len(); + let ei = with_padding.len() - pad_end.len(); + let target = &with_padding[si..ei]; + + assert!(!target.starts_with(" ") && !target.ends_with(" ")); + let expected_count = tmpl_char_count * repeat; + assert_eq!( + expected_count, + target.chars().count(), + "wrong count for `{:?}.repeat({})` (padding: `{:?}`)", + tmpl_str, + repeat, + (pad_start.len(), pad_end.len()), + ); + } + } + } + } +} + +#[test] +fn floor_char_boundary() { + fn check_many(s: &str, arg: impl IntoIterator<Item = usize>, ret: usize) { + for idx in arg { + assert_eq!( + s.floor_char_boundary(idx), + ret, + "{:?}.floor_char_boundary({:?}) != {:?}", + s, + idx, + ret + ); + } + } + + // edge case + check_many("", [0, 1, isize::MAX as usize, usize::MAX], 0); + + // basic check + check_many("x", [0], 0); + check_many("x", [1, isize::MAX as usize, usize::MAX], 1); + + // 1-byte chars + check_many("jp", [0], 0); + check_many("jp", [1], 1); + check_many("jp", 2..4, 2); + + // 2-byte chars + check_many("ĵƥ", 0..2, 0); + check_many("ĵƥ", 2..4, 2); + check_many("ĵƥ", 4..6, 4); + + // 3-byte chars + check_many("日本", 0..3, 0); + check_many("日本", 3..6, 3); + check_many("日本", 6..8, 6); + + // 4-byte chars + check_many("🇯🇵", 0..4, 0); + check_many("🇯🇵", 4..8, 4); + check_many("🇯🇵", 8..10, 8); +} + +#[test] +fn ceil_char_boundary() { + fn check_many(s: &str, arg: impl IntoIterator<Item = usize>, ret: usize) { + for idx in arg { + assert_eq!( + 
s.ceil_char_boundary(idx), + ret, + "{:?}.ceil_char_boundary({:?}) != {:?}", + s, + idx, + ret + ); + } + } + + // edge case + check_many("", [0], 0); + + // basic check + check_many("x", [0], 0); + check_many("x", [1], 1); + + // 1-byte chars + check_many("jp", [0], 0); + check_many("jp", [1], 1); + check_many("jp", [2], 2); + + // 2-byte chars + check_many("ĵƥ", 0..=0, 0); + check_many("ĵƥ", 1..=2, 2); + check_many("ĵƥ", 3..=4, 4); + + // 3-byte chars + check_many("日本", 0..=0, 0); + check_many("日本", 1..=3, 3); + check_many("日本", 4..=6, 6); + + // 4-byte chars + check_many("🇯🇵", 0..=0, 0); + check_many("🇯🇵", 1..=4, 4); + check_many("🇯🇵", 5..=8, 8); +} + +#[test] +#[should_panic] +fn ceil_char_boundary_above_len_panic() { + let _ = "x".ceil_char_boundary(2); +} +*/ diff --git a/tests/rustdoc/async-fn.rs b/tests/rustdoc/async-fn.rs index 70bcbcb6ff44a..3f641473d308a 100644 --- a/tests/rustdoc/async-fn.rs +++ b/tests/rustdoc/async-fn.rs @@ -46,7 +46,7 @@ impl Foo { pub async fn mut_self(mut self, mut first: usize) {} } -pub trait Pattern<'a> {} +pub trait Pattern<H> {} pub trait Trait<const N: usize> {} // @has async_fn/fn.const_generics.html @@ -91,5 +91,5 @@ impl Foo { // @has - '//pre[@class="rust item-decl"]' "pub async fn named<'a, 'b>(foo: &'a str) -> &'b str" pub async fn named<'a, 'b>(foo: &'a str) -> &'b str {} // @has async_fn/fn.named_trait.html -// @has - '//pre[@class="rust item-decl"]' "pub async fn named_trait<'a, 'b>(foo: impl Pattern<'a>) -> impl Pattern<'b>" -pub async fn named_trait<'a, 'b>(foo: impl Pattern<'a>) -> impl Pattern<'b> {} +// @has - '//pre[@class="rust item-decl"]' "pub async fn named_trait<'a, 'b>(foo: impl Pattern<&'a str>) -> impl Pattern<&'b str>" +pub async fn named_trait<'a, 'b>(foo: impl Pattern<&'a str>) -> impl Pattern<&'b str> {} diff --git a/tests/ui/traits/bound/assoc-fn-bound-root-obligation.rs b/tests/ui/traits/bound/assoc-fn-bound-root-obligation.rs index f9a9347641143..8a047a082c4a4 100644 --- 
a/tests/ui/traits/bound/assoc-fn-bound-root-obligation.rs +++ b/tests/ui/traits/bound/assoc-fn-bound-root-obligation.rs @@ -3,7 +3,7 @@ fn strip_lf(s: &str) -> &str { //~^ ERROR expected a `FnMut<(char,)>` closure, found `u8` //~| NOTE expected an `FnMut<(char,)>` closure, found `u8` //~| HELP the trait `FnMut<(char,)>` is not implemented for `u8` - //~| HELP the following other types implement trait `Pattern<'a>`: + //~| HELP the following other types implement trait `Pattern<&'a str>`: //~| NOTE required for `u8` to implement `Pattern<'_>` } diff --git a/tests/ui/traits/bound/assoc-fn-bound-root-obligation.stderr b/tests/ui/traits/bound/assoc-fn-bound-root-obligation.stderr index ce9ab2d811ae1..e97aaa6834309 100644 --- a/tests/ui/traits/bound/assoc-fn-bound-root-obligation.stderr +++ b/tests/ui/traits/bound/assoc-fn-bound-root-obligation.stderr @@ -5,7 +5,7 @@ LL | s.strip_suffix(b'\n').unwrap_or(s) | ^^^^^^^^^^^^ expected an `FnMut<(char,)>` closure, found `u8` | = help: the trait `FnMut<(char,)>` is not implemented for `u8` - = help: the following other types implement trait `Pattern<'a>`: + = help: the following other types implement trait `Pattern<&'a str>`: &'b String &'b [char; N] &'b [char]