From ee0def25e14e125137648dbac7651d892fd99f62 Mon Sep 17 00:00:00 2001 From: kennytm Date: Tue, 16 Jan 2018 20:56:39 +0800 Subject: [PATCH 1/7] OsStr pattern. --- text/0000-os-str-pattern.md | 452 ++++++++++++++++++++++++++++++++++++ 1 file changed, 452 insertions(+) create mode 100644 text/0000-os-str-pattern.md diff --git a/text/0000-os-str-pattern.md b/text/0000-os-str-pattern.md new file mode 100644 index 00000000000..e4002923376 --- /dev/null +++ b/text/0000-os-str-pattern.md @@ -0,0 +1,452 @@ +- Feature Name: `os_str_pattern` +- Start Date: 2018-01-16 +- RFC PR: (leave this empty) +- Rust Issue: (leave this empty) + +# Summary +[summary]: #summary + +Generalize the WTF-8 encoding to allow `OsStr` to use the pattern API methods. + +# Motivation +[motivation]: #motivation + +`OsStr` is missing many common string methods compared to the standard `str` or even `[u8]`. There +have been numerous attempts to expand the API surface, the latest one being [RFC #1309], which +leads to an attempt to [revamp the `std::pattern::Pattern` API][Kimundi/rust_pattern_api_v2], but +eventually closed due to inactivity and lack of resource. + +Over the past several years, there has been numerous requests and attempts to implement these +missing functions in particular `OsStr::starts_with` ([1][#22741], [2][#26499], [3][#40300], +[4][urlo #10403], [5][irlo #6277], [6][os-str-generic]). + +The main difficulty applying `str` APIs to `OsStr` is [WTF-8]. A surrogate pair (e.g. U+10000 = +`d800 dc00`) is encoded as a 4-byte sequence (`f0 90 80 80`) similar to UTF-8, but an unpaired +surrogate (e.g. U+D800 alone) is encoded as a completely distinct 3-byte sequence (`ed a0 80`). +Naively extending the slice-based pattern API will not work, e.g. you cannot find any `ed a0 80` +inside `f0 90 80 80`, so `.starts_with()` is going to be more complex, and `.split()` certainly +cannot borrow a well-formed WTF-8 slice from it. + +The solution proposed by RFC #1309 is to create two sets of APIs. One, `.contains_os()`, +`.starts_with_os()`, `.ends_with_os()` and `.replace()` which do not require borrowing, will support +using `&OsStr` as input. The rest like `.split()`, `.matches()` and `.trim()` which require +borrowing, will only accept UTF-8 strings as input. + +The โ€œpattern 2.0โ€ API does not split into two sets of APIs, but will panic when the search string +starts with or ends with an unpaired surrogate. + +We feel that these designs are not elegant enough. This RFC attempts to fix the problem by going one +level lower, by generalizing WTF-8 so that splitting a surrogate pair is allowed, so we could search +an `OsStr` with an `OsStr` using a single Pattern API without panicking. + +[Kimundi/rust_pattern_api_v2]: https://github.com/Kimundi/rust_pattern_api_v2 +[RFC #1309]: https://github.com/rust-lang/rfcs/pull/1309 +[#22741]: https://github.com/rust-lang/rust/issues/22741 +[#26499]: https://github.com/rust-lang/rust/issues/26499 +[#40300]: https://github.com/rust-lang/rust/issues/40300 +[urlo #10403]: https://users.rust-lang.org/t/comparing-osstr-for-prefixes-and-suffixes/10403 +[irlo #6277]: https://internals.rust-lang.org/t/make-std-os-unix-ffi-osstrext-cross-platform/6277 +[os-str-generic]: https://docs.rs/os-str-generic +[WTF-8]: https://simonsapin.github.io/wtf-8/ + +# Guide-level explanation +[guide-level-explanation]: #guide-level-explanation + +The following new methods are now available to `OsStr`. They behave the same as their counterpart in +`str`. + +```rust +impl OsStr { + pub fn contains<'a, P>(&'a self, pat: P) -> bool + where + P: Pattern<&'a Self>; + + pub fn starts_with<'a, P>(&'a self, pat: P) -> bool + where + P: Pattern<&'a Self>; + + pub fn ends_with<'a, P>(&'a self, pat: P) -> bool + where + P: Pattern<&'a Self>, + P::Searcher: ReverseSearcher<&'a Self>; + + pub fn find<'a, P>(&'a self, pat: P) -> Option + where + P: Pattern<&'a Self>; + + pub fn rfind<'a, P>(&'a self, pat: P) -> Option + where + P: Pattern<&'a Self>, + P::Searcher: ReverseSearcher<&'a Self>; + + /// Finds the first range of this string which contains the pattern. + /// + /// # Examples + /// + /// ```rust + /// let path = OsStr::new("/usr/bin/bash"); + /// let range = path.find_range("/b"); + /// assert_eq!(range, Some(4..6)); + /// assert_eq!(path[range.unwrap()], OsStr::new("/bin")); + /// ``` + pub fn find_range<'a, P>(&'a self, pat: P) -> Option> + where + P: Pattern<&'a Self>; + + /// Finds the last range of this string which contains the pattern. + /// + /// # Examples + /// + /// ```rust + /// let path = OsStr::new("/usr/bin/bash"); + /// let range = path.rfind_range("/b"); + /// assert_eq!(range, Some(8..10)); + /// assert_eq!(path[range.unwrap()], OsStr::new("/bin")); + /// ``` + pub fn rfind_range<'a, P>(&'a self, pat: P) -> Option> + where + P: Pattern<&'a Self>, + P::Searcher: ReverseSearcher<&'a Self>; + + // (Note: these should return a concrete iterator type instead of `impl Trait`. + // For ease of explanation the concrete type is not listed here.) + pub fn split<'a, P>(&'a self, pat: P) -> impl Iterator + where + P: Pattern<&'a Self>; + + pub fn rsplit<'a, P>(&'a self, pat: P) -> impl Iterator + where + P: Pattern<&'a Self>, + P::Searcher: ReverseSearcher<&'a Self>; + + pub fn split_terminator<'a, P>(&'a self, pat: P) -> impl Iterator + where + P: Pattern<&'a Self>; + + pub fn rsplit_terminator<'a, P>(&'a self, pat: P) -> impl Iterator + where + P: Pattern<&'a Self>, + P::Searcher: ReverseSearcher<&'a Self>; + + pub fn splitn<'a, P>(&'a self, n: usize, pat: P) -> impl Iterator + where + P: Pattern<&'a Self>; + + pub fn rsplitn<'a, P>(&'a self, n: usize, pat: P) -> impl Iterator + where + P: Pattern<&'a Self>, + P::Searcher: ReverseSearcher<&'a Self>; + + pub fn matches<'a, P>(&'a self, pat: P) -> impl Iterator + where + P: Pattern<&'a Self>; + + pub fn rmatches<'a, P>(&self, pat: P) -> impl Iterator + where + P: Pattern<&'a Self>, + P::Searcher: ReverseSearcher<&'a Self>; + + pub fn match_indices<'a, P>(&self, pat: P) -> impl Iterator + where + P: Pattern<&'a Self>; + + pub fn rmatch_indices<'a, P>(&self, pat: P) -> impl Iterator + where + P: Pattern<&'a Self>, + P::Searcher: ReverseSearcher<&'a Self>; + + // this is new + pub fn match_ranges<'a, P>(&'a self, pat: P) -> impl Iterator, &'a Self)> + where + P: Pattern<&'a Self>; + + // this is new + pub fn rmatch_ranges<'a, P>(&'a self, pat: P) -> impl Iterator, &'a Self)> + where + P: Pattern<&'a Self>, + P::Searcher: ReverseSearcher<&'a Self>; + + pub fn trim_matches<'a, P>(&'a self, pat: P) -> &'a Self + where + P: Pattern<&'a Self>, + P::Searcher: DoubleEndedSearcher<&'a Self>; + + pub fn trim_left_matches<'a, P>(&'a self, pat: P) -> &'a Self + where + P: Pattern<&'a Self>; + + pub fn trim_right_matches<'a, P>(&'a self, pat: P) -> &'a Self + where + P: Pattern<&'a Self>, + P::Searcher: ReverseSearcher<&'a Self>; + + pub fn replace<'a, P>(&'a self, from: P, to: &'a Self) -> Self::Owned + where + P: Pattern<&'a Self>; + + pub fn replacen<'a, P>(&'a self, from: P, to: &'a Self, count: usize) -> Self::Owned + where + P: Pattern<&'a Self>; +} +``` + +We also allow slicing an `OsStr`. + +```rust +impl Index for OsStr { ... } +impl Index> for OsStr { ... } +impl Index> for OsStr { ... } +impl Index> for OsStr { ... } +``` + +Example: + +```rust +// (assume we are on Windows) + +let path = OsStr::new(r"C:\Users\Admin\๐Ÿ˜€\๐Ÿ˜๐Ÿ˜‚๐Ÿ˜ƒ๐Ÿ˜„.txt"); +// can use starts_with, ends_with +assert!(path.starts_with(OsStr::new(r"C:\"))); +assert!(path.ends_with(OsStr::new(".txt")); +// can use rfind_range to get the range of substring +let last_backslash = path.rfind_range(OsStr::new(r"\")).unwrap(); +assert_eq!(last_backslash, 16..17); +// can perform slicing. +let file_name = &path[last_backslash.end..]; +// can perform splitting, even if it results in invalid Unicode! +let mut parts = file_name.split(&*OsString::from_wide(&[0xd83d])); +assert_eq!(parts.next(), Some(OsStr::new(""))); +assert_eq!(parts.next(), Some(&*OsString::from_wide(&[0xde01]))); +assert_eq!(parts.next(), Some(&*OsString::from_wide(&[0xde02]))); +assert_eq!(parts.next(), Some(&*OsString::from_wide(&[0xde03]))); +assert_eq!(parts.next(), Some(&*OsString::from_wide(&[0xde04, 0x2e, 0x74, 0x78, 0x74]))); +assert_eq!(parts.next(), None); +``` + +# Reference-level explanation +[reference-level-explanation]: #reference-level-explanation + +It is trivial to apply the pattern API to `OsStr` on platforms where it is just an `[u8]`. The main +difficulty is on Windows where it is an `[u16]` encoded as WTF-8. This RFC thus focuses on Windows +only. + +We will generalize the encoding of `OsStr` to specify these two capabilities: + +1. Slicing a surrogate pair in half: + + ```rust + let s = OsStr::new("\u{10000}"); + assert_eq!(&s[..2], &*OsString::from_wide(&[0xd800])); + assert_eq!(&s[2..], &*OsString::from_wide(&[0xdc00])); + ``` + +2. Finding a surrogate code point, no matter paired or unpaired: + + ```rust + let needle = OsString::from_wide(&[0xdc00]); + assert_eq!(OsStr::new("\u{10000}").find(&needle), Some(2)); + assert_eq!(OsString::from_wide(&[0x3f, 0xdc00]).find(&needle), Some(1)); + ``` + +These allow us to implement the โ€œPattern 1.5โ€ API for all `OsStr` without panicking. Implementation +detail can be found in the [`omgwtf8` package](https://github.com/kennytm/omgwtf8). + +## Slicing + +A surrogate pair is a 4-byte sequence in both UTF-8 and WTF-8. We support slicing it in half by +representing the high surrogate by the first 3 bytes, and the low surrogate by the last 3 bytes. + +``` +"\u{10000}" = f0 90 80 80 +"\u{10000}"[..2] = f0 90 80 +"\u{10000}"[2..] = 90 80 80 +``` + +Note that this means: + +1. `x[..i]` and `x[i..]` will have overlapping parts. This makes `OsStr::split_at_mut` (if exists) + unable to split a surrogate pair in half. This also means `Pattern<&mut OsStr>` cannot be + implemented for `&OsStr`. +2. The length of `x[..n]` may be longer than `n`. + +## Comparison and storage + +All `OsStr` strings with sliced 4-byte sequence can be converted back to proper WTF-8 with an O(1) +transformation: + +* If the string starts with `[\x80-\xbf]{3}`, replace these 3 bytes with the canonical low surrogate + encoding. +* If the string ends with `[\xf0-\xf4][\x80-\xbf]{2}`, replace these 3 bytes with the canonical high + surrogate encoding. + +We can this transformation โ€œ*canonicalization*โ€. + +All owned `OsStr` should be canonicalized to contain well-formed WTF-8 only: `Box`, +`Rc`, `Arc` and `OsString`. + +Two `OsStr` are compared equal if they have the same canonicalization. + +## Matching + +When an `OsStr` is used for matching, an unpaired low surrogate at the beginning and unpaired high +surrogate at the end must be replaced by regular expressions that match all pre-canonicalization +possibilities. For instance, matching for `xxxx\u{d9ab}` would create the following regex: + +``` +xxxx( + \xed\xa6\xab # canonical representation +| + \xf2\x86[\xb0-\xbf] # split representation +) +``` + +and matching for `\u{dcef}xxxx` with create the following regex: + +``` +( + \xed\xb3\xaf # canonical representation +| + [\x80-\xbf][\x83\x93\xa3\xb3]\xaf # split representation +)xxxx +``` + +After finding a match, if the end points to the middle of a 4-byte sequence, the search engine +should move backward by 2 bytes before continuing. This ensure searching for `\u{dc00}\u{d800}` in +`\u{10000}\u{10000}\u{10000}` will properly yield 2 matches. + +## Pattern API + +This RFC assumes a generalized pattern API which supports more than strings. If the pattern API is +not available, the new functions can take `&OsStr` instead of `impl Pattern<&OsStr>`, but this may +hurt future compatibility due to inference breakage. + +Assuming we do want to generalize the Pattern API, the implementor should note the issue of +splitting a surrogate pair: + +1. A match which starts with a low surrogate will point to byte 1 of the 4-byte sequence +2. An index always point to byte 2 of the 4-byte sequence +3. A match which ends with a high surrogate will point to byte 3 of the 4-byte sequence + +Implementation should note these different offsets when converting between different kinds of +cursors. In the [`omgwtf8::pattern` module](https://docs.rs/omgwtf8/*/omgwtf8/pattern/index.html), +this behavior is enforced by using distinct types for the start and end cursors. + +```rust +pub trait Pattern: Sized { + type Searcher: Searcher; + fn into_searcher(self, haystack: H) -> Self::Searcher; + fn is_contained_in(self, haystack: H) -> bool; + fn is_prefix_of(self, haystack: H) -> bool; + fn is_suffix_of(self, haystack: H) -> bool where Self::Searcher: ReverseSearcher; +} + +pub trait Searcher { + fn haystack(&self) -> H; + fn next_match(&mut self) -> Option<(H::StartCursor, H::EndCursor)>; + fn next_reject(&mut self) -> Option<(H::StartCursor, H::EndCursor)>; +} + +// equivalent to SearchPtrs in "Pattern API 1.5" +// and PatternHaystack in "Pattern API 2.0" +pub trait Haystack: Sized { + type StartCursor: Copy; + type EndCursor: Copy; + + // The following 5 methods are same as those in "Pattern API 1.5" + // except the cursor type is split into two. + fn cursor_at_front(hs: &Self) -> Self::StartCursor; + fn cursor_at_back(hs: &Self) -> Self::EndCursor; + unsafe fn start_cursor_to_offset(hs: &Self, cur: Self::StartCursor) -> usize; + unsafe fn end_cursor_to_offset(hs: &Self, cur: Self::EndCursor) -> usize; + unsafe fn range_to_self(hs: Self, start: Self::StartCursor, end: Self::EndCursor) -> Self; + + // Since a StartCursor and EndCursor may not be comparable, we also need this method + fn is_range_empty(start: Self::StartCursor, end: Self::EndCursor) -> bool; + + // And then we want to swap between the two cursor types + unsafe fn start_to_end_cursor(hs: &Self, cur: Self::StartCursor) -> Self::EndCursor; + unsafe fn end_to_start_cursor(hs: &Self, cur: Self::EndCursor) -> Self::StartCursor; +} +``` + +For `&OsStr`, we define both `StartCursor` and `EndCursor` as `*const u8`. + +The `start_to_end_cursor` function will return `cur + 2` if we find that `cur` points to the middle +of a 4-byte sequence. + +The `start_cursor_to_offset` function will return `cur - hs + 1` if we find that `cur` points to the +middle of a 4-byte sequenced. + +These type safety measures ensure functions utilizing a generic `Pattern` can get the correctly +overlapping slices when splitting a surrogate pair. + +```rust +// (actual code implementing `.split()`) +match self.matcher.next_match() { + Some((a, b)) => unsafe { + let haystack = self.matcher.haystack(); + let a = H::start_to_end_cursor(&haystack, a); + let b = H::end_to_start_cursor(&haystack, b); + let elt = H::range_to_self(haystack, self.start, a); + // ^ without `start_to_end_cursor`, the slice `elt` may be short by 2 bytes + self.start = b; + // ^ without `end_to_start_cursor`, the next starting position may skip 2 bytes + Some(elt) + }, + None => self.get_end(), +} +``` + +# Drawbacks +[drawbacks]: #drawbacks + +* **It breaks the invariant `x[..n].len() == n`.** + + Note that `OsStr` did not provide a slicing operator, and it already violated the invariant + `(x + y).len() == x.len() + y.len()`. + +* **A surrogate code point may be 2 or 3 indices long depending on context.** + + This means code using `x[i..(i+n)]` may give wrong result. + + ```rust + let needle = OsString::from_wide(&[0xdc00]); + let haystack = OsStr::new("\u{10000}a"); + let index = haystack.find(&needle).unwrap(); + let matched = &haystack[index..(index + needle.len()]; + // `matched` will contain "\u{dc00}a" instead of "\u{dc00}". + ``` + + As a workaround, we introduced `find_range` and `match_ranges`. Note that this is already a + problem to solve if we want to make `Regex` a pattern. + +# Rationale and alternatives +[alternatives]: #alternatives + +This is the only design which allows borrowing a sub-slice of a surrogate code point from a +surrogate pair. + +An alternative is keep using the vanilla WTF-8, and treat a surrogate pair as an atomic entity: +makes it impossible to split a surrogate pair after it is formed. The advantages are that + +* The pattern API becomes a simple substring search. +* Slicing behavior is consistent with `str`. + +There are two potential implementations when we want to match with an unpaired surrogate: + +1. **Declare that a surrogate pair does not contain the unpaired surrogate**, i.e. make + `"\u{10000}".find("\u{d800}")` return `None`. An unpaired surrogate can only be used to match + another unpaired surrogate. + + If we choose this, it means `x.find(z).is_some()` does not imply `(x + y).find(z).is_some()`. + +2. **Disallow matching when the pattern contains an unpaired surrogate at the boundary**, i.e. make + `"\u{10000}".find("\u{d800}")` panic. This is the approach chosen by โ€œPattern API 2.0โ€. + +Note that, for consistency, we need to make `"\u{10000}".starts_with("\u{d800}")` return `false` or +panic. + +# Unresolved questions +[unresolved]: #unresolved-questions + +None yet. \ No newline at end of file From 8b1171c7ec18f7374644618bba72c00c1595ccaa Mon Sep 17 00:00:00 2001 From: kennytm Date: Mon, 26 Feb 2018 03:36:28 +0800 Subject: [PATCH 2/7] Clarify the slicing operation. --- text/0000-os-str-pattern.md | 85 ++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 5 deletions(-) diff --git a/text/0000-os-str-pattern.md b/text/0000-os-str-pattern.md index e4002923376..6b3e24e72a6 100644 --- a/text/0000-os-str-pattern.md +++ b/text/0000-os-str-pattern.md @@ -227,8 +227,7 @@ assert_eq!(parts.next(), None); [reference-level-explanation]: #reference-level-explanation It is trivial to apply the pattern API to `OsStr` on platforms where it is just an `[u8]`. The main -difficulty is on Windows where it is an `[u16]` encoded as WTF-8. This RFC thus focuses on Windows -only. +difficulty is on Windows where it is an `[u16]` encoded as WTF-8. This RFC thus focuses on Windows. We will generalize the encoding of `OsStr` to specify these two capabilities: @@ -262,6 +261,9 @@ representing the high surrogate by the first 3 bytes, and the low surrogate by t "\u{10000}"[2..] = 90 80 80 ``` +The index splitting the surrogate pair will be positioned at the middle of the 4-byte sequence +(index "2" in the above example). + Note that this means: 1. `x[..i]` and `x[i..]` will have overlapping parts. This makes `OsStr::split_at_mut` (if exists) @@ -269,6 +271,34 @@ Note that this means: implemented for `&OsStr`. 2. The length of `x[..n]` may be longer than `n`. +### Platform-agnostic guarantees + +If an index points to an invalid position (e.g. `\u{1000}[1..]` or `"\u{10000}"[1..]` or +`"\u{10000}"[3..]`), a panic will be raised, similar to that of `str`. The following are guaranteed +to be valid positions on all platforms: + +* `0`. +* `self.len()`. +* The returned indices from `find()`, `rfind()`, `match_indices()` and `rmatch_indices()`. +* The returned ranges from `find_range()`, `rfind_range()`, `match_ranges()` and `rmatch_ranges()`. + +Index arithmetic is wrong for `OsStr`, i.e. `i + n` may not produce the correct index (see +[Drawbacks](#drawbacks)). + +For WTF-8 encoding on Windows, we define: + +* boundary of a character or surrogate byte sequence is Valid. +* middle (byte 2) of a 4-byte sequence is Valid. +* interior of a 2- or 3-byte sequence is Invalid. +* byte 1 or 3 of a 4-byte sequence is Invalid. + +Outside of Windows where the `OsStr` consists of arbitrary bytes, all numbers within +`0 ..= self.len()` are considered a valid index. This is because we want to allow +`os_str.find(OsStr::from_bytes(b"\xff"))`, and thus cannot use UTF-8 to reason with a Unix `OsStr`. + +Note that we have never guaranteed the actual `OsStr` encoding, these should only be considered an +implementation detail. + ## Comparison and storage All `OsStr` strings with sliced 4-byte sequence can be converted back to proper WTF-8 with an O(1) @@ -284,7 +314,9 @@ We can this transformation โ€œ*canonicalization*โ€. All owned `OsStr` should be canonicalized to contain well-formed WTF-8 only: `Box`, `Rc`, `Arc` and `OsString`. -Two `OsStr` are compared equal if they have the same canonicalization. +Two `OsStr` are compared equal if they have the same canonicalization. This may slightly reduce the +performance with a constant overhead, since there would be more checking involving the first and +last three bytes. ## Matching @@ -423,7 +455,9 @@ match self.matcher.next_match() { # Rationale and alternatives [alternatives]: #alternatives -This is the only design which allows borrowing a sub-slice of a surrogate code point from a +## Indivisible surrogate pair + +This RFC is the only design which allows borrowing a sub-slice of a surrogate code point from a surrogate pair. An alternative is keep using the vanilla WTF-8, and treat a surrogate pair as an atomic entity: @@ -446,7 +480,48 @@ There are two potential implementations when we want to match with an unpaired s Note that, for consistency, we need to make `"\u{10000}".starts_with("\u{d800}")` return `false` or panic. +## Slicing at real byte offset + +The current RFC defines the index that splits a surrogate pair into half at byte 2 of the 4-byte +sequence. This has the drawback of `"\u{10000}"[..2].len() == 3`, and caused index arithmetic to be +wrong. + +``` +"\u{10000}" = f0 90 80 80 +"\u{10000}"[..2] = f0 90 80 +"\u{10000}"[2..] = 90 80 80 +``` + +The main advantage of this scheme is we could use the same number as the start and end index. + +```rust +let s = OsStr::new("\u{10000}"); +assert_eq!(s.len(), 4); +let index = s.find('\u{dc00}').unwrap(); +let right = &s[index..]; // [90 80 80] +let left = &s[..index]; // [f0 90 80] +``` + +An alternative make the index refer to the real byte offsets: + +``` +"\u{10000}" = f0 90 80 80 +"\u{10000}"[..3] = f0 90 80 +"\u{10000}"[1..] = 90 80 80 +``` + +However the question would be, what should `s[..1]` do? + +* **Panic** โ€” But this means we cannot get `left`. We could inspect the raw bytes of `s` itself and + perform `&s[..(index + 2)]`, but we never explicitly exposed the encoding of `OsStr`, so we + cannot read a single byte and thus impossible to do this. + +* **Treat as same as `s[..3]`** โ€” But then this inherits all the disadvantages of using 2 as valid + index, plus we need to consider whether `s[1..3]` and `s[3..1]` should be valid. + +Given these, we decided not to treat the real byte offsets as valid indices. + # Unresolved questions [unresolved]: #unresolved-questions -None yet. \ No newline at end of file +None yet. From 7e1b03224e2549e7af7c3c9ed10de5144b9ddfaa Mon Sep 17 00:00:00 2001 From: kennytm Date: Tue, 13 Mar 2018 02:44:58 +0800 Subject: [PATCH 3/7] Fixed some typos. --- text/0000-os-str-pattern.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/text/0000-os-str-pattern.md b/text/0000-os-str-pattern.md index 6b3e24e72a6..d430545bdb3 100644 --- a/text/0000-os-str-pattern.md +++ b/text/0000-os-str-pattern.md @@ -87,7 +87,7 @@ impl OsStr { /// let path = OsStr::new("/usr/bin/bash"); /// let range = path.find_range("/b"); /// assert_eq!(range, Some(4..6)); - /// assert_eq!(path[range.unwrap()], OsStr::new("/bin")); + /// assert_eq!(path[range.unwrap()], OsStr::new("/b")); /// ``` pub fn find_range<'a, P>(&'a self, pat: P) -> Option> where @@ -101,7 +101,7 @@ impl OsStr { /// let path = OsStr::new("/usr/bin/bash"); /// let range = path.rfind_range("/b"); /// assert_eq!(range, Some(8..10)); - /// assert_eq!(path[range.unwrap()], OsStr::new("/bin")); + /// assert_eq!(path[range.unwrap()], OsStr::new("/b")); /// ``` pub fn rfind_range<'a, P>(&'a self, pat: P) -> Option> where @@ -445,7 +445,7 @@ match self.matcher.next_match() { let needle = OsString::from_wide(&[0xdc00]); let haystack = OsStr::new("\u{10000}a"); let index = haystack.find(&needle).unwrap(); - let matched = &haystack[index..(index + needle.len()]; + let matched = &haystack[index..(index + needle.len())]; // `matched` will contain "\u{dc00}a" instead of "\u{dc00}". ``` From c15fd7d0b11b60c9145bf3610212c90662892c37 Mon Sep 17 00:00:00 2001 From: kennytm Date: Tue, 13 Mar 2018 02:45:30 +0800 Subject: [PATCH 4/7] Mention the name OMG-WTF-8. --- text/0000-os-str-pattern.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/text/0000-os-str-pattern.md b/text/0000-os-str-pattern.md index d430545bdb3..775df5dfae0 100644 --- a/text/0000-os-str-pattern.md +++ b/text/0000-os-str-pattern.md @@ -229,7 +229,7 @@ assert_eq!(parts.next(), None); It is trivial to apply the pattern API to `OsStr` on platforms where it is just an `[u8]`. The main difficulty is on Windows where it is an `[u16]` encoded as WTF-8. This RFC thus focuses on Windows. -We will generalize the encoding of `OsStr` to specify these two capabilities: +We will generalize the encoding of `OsStr` to โ€œ[OMG-WTF-8]โ€ which specifies these two capabilities: 1. Slicing a surrogate pair in half: @@ -248,7 +248,9 @@ We will generalize the encoding of `OsStr` to specify these two capabilities: ``` These allow us to implement the โ€œPattern 1.5โ€ API for all `OsStr` without panicking. Implementation -detail can be found in the [`omgwtf8` package](https://github.com/kennytm/omgwtf8). +detail can be found in the [`omgwtf8` package][OMG-WTF-8]. + +[OMG-WTF-8]: https://github.com/kennytm/omgwtf8 ## Slicing From 3d481486c7fa67e17ea5d69b0aed2d167403daf7 Mon Sep 17 00:00:00 2001 From: kennytm Date: Tue, 13 Mar 2018 02:45:42 +0800 Subject: [PATCH 5/7] Expand more about the Pattern API. --- text/0000-os-str-pattern.md | 40 +++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/text/0000-os-str-pattern.md b/text/0000-os-str-pattern.md index 775df5dfae0..3ec200ff8a6 100644 --- a/text/0000-os-str-pattern.md +++ b/text/0000-os-str-pattern.md @@ -350,12 +350,24 @@ should move backward by 2 bytes before continuing. This ensure searching for `\u ## Pattern API -This RFC assumes a generalized pattern API which supports more than strings. If the pattern API is -not available, the new functions can take `&OsStr` instead of `impl Pattern<&OsStr>`, but this may -hurt future compatibility due to inference breakage. +As of Rust 1.25, we can search a `&str` using a character, a character set or another string, +powered by [RFC #528](https://github.com/rust-lang/rfcs/pull/528) a.k.a. โ€œPattern API 1.0โ€. -Assuming we do want to generalize the Pattern API, the implementor should note the issue of -splitting a surrogate pair: +There are some drafts to generalize this so that we could retain mutability and search in more types +such as `&[T]` and `&OsStr`, as described in various comments +(โ€œ[v1.5](https://github.com/rust-lang/rust/issues/27721#issuecomment-185405392)โ€ and +โ€œ[v2.0](https://github.com/rust-lang/rfcs/pull/1309#issuecomment-214030263)โ€). A proper RFC has not +been proposed so far. + +This RFC assumes the target of generalizing the Pattern API beyond `&str` is accepted, enabling us +to provide a uniform search API between different types of haystack and needles. However, this RFC +does not rely on a generalized Pattern API. If this RFC is stabilized without a generalized Pattern +API, the new methods described in the [Guide-level explanation][guide-level-explanation] section can +take `&OsStr` instead of `impl Pattern<&OsStr>`, but this may hurt future compatibility due to +inference breakage if generalized Pattern API is indeed implemented. + +Assuming we do want to generalize Pattern API, the implementor should note the issue of splitting a +surrogate pair: 1. A match which starts with a low surrogate will point to byte 1 of the 4-byte sequence 2. An index always point to byte 2 of the 4-byte sequence @@ -363,9 +375,14 @@ splitting a surrogate pair: Implementation should note these different offsets when converting between different kinds of cursors. In the [`omgwtf8::pattern` module](https://docs.rs/omgwtf8/*/omgwtf8/pattern/index.html), -this behavior is enforced by using distinct types for the start and end cursors. +based on the โ€œv1.5โ€ draft, this behavior is enforced in the API design by using distinct types for +the start and end cursors. + +The following outlines the generalized Pattern API which could work for `&OsStr`: ```rust +// in module `core::pattern`: + pub trait Pattern: Sized { type Searcher: Searcher; fn into_searcher(self, haystack: H) -> Self::Searcher; @@ -380,6 +397,13 @@ pub trait Searcher { fn next_reject(&mut self) -> Option<(H::StartCursor, H::EndCursor)>; } +pub trait ReverseSearcher: Searcher { + fn next_match_back(&mut self) -> Option<(H::StartCursor, H::EndCursor)>; + fn next_reject_back(&mut self) -> Option<(H::StartCursor, H::EndCursor)>; +} + +pub trait DoubleEndedSearcher: ReverseSearcher {} + // equivalent to SearchPtrs in "Pattern API 1.5" // and PatternHaystack in "Pattern API 2.0" pub trait Haystack: Sized { @@ -403,7 +427,7 @@ pub trait Haystack: Sized { } ``` -For `&OsStr`, we define both `StartCursor` and `EndCursor` as `*const u8`. +For the `&OsStr` haystack, we define both `StartCursor` and `EndCursor` as `*const u8`. The `start_to_end_cursor` function will return `cur + 2` if we find that `cur` points to the middle of a 4-byte sequence. @@ -452,7 +476,7 @@ match self.matcher.next_match() { ``` As a workaround, we introduced `find_range` and `match_ranges`. Note that this is already a - problem to solve if we want to make `Regex` a pattern. + problem to solve if we want to make `Regex` a pattern of strings. # Rationale and alternatives [alternatives]: #alternatives From bcf499fe2c7ea342e3adccd951cd9720d6b4469e Mon Sep 17 00:00:00 2001 From: kennytm Date: Fri, 30 Mar 2018 17:21:51 +0800 Subject: [PATCH 6/7] Bound the cursor by PartialOrd instead of using `is_range_empty()`. --- text/0000-os-str-pattern.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/text/0000-os-str-pattern.md b/text/0000-os-str-pattern.md index 3ec200ff8a6..493de0a654a 100644 --- a/text/0000-os-str-pattern.md +++ b/text/0000-os-str-pattern.md @@ -407,8 +407,8 @@ pub trait DoubleEndedSearcher: ReverseSearcher {} // equivalent to SearchPtrs in "Pattern API 1.5" // and PatternHaystack in "Pattern API 2.0" pub trait Haystack: Sized { - type StartCursor: Copy; - type EndCursor: Copy; + type StartCursor: Copy + PartialOrd; + type EndCursor: Copy + PartialOrd; // The following 5 methods are same as those in "Pattern API 1.5" // except the cursor type is split into two. @@ -418,9 +418,6 @@ pub trait Haystack: Sized { unsafe fn end_cursor_to_offset(hs: &Self, cur: Self::EndCursor) -> usize; unsafe fn range_to_self(hs: Self, start: Self::StartCursor, end: Self::EndCursor) -> Self; - // Since a StartCursor and EndCursor may not be comparable, we also need this method - fn is_range_empty(start: Self::StartCursor, end: Self::EndCursor) -> bool; - // And then we want to swap between the two cursor types unsafe fn start_to_end_cursor(hs: &Self, cur: Self::StartCursor) -> Self::EndCursor; unsafe fn end_to_start_cursor(hs: &Self, cur: Self::EndCursor) -> Self::StartCursor; From f708b56acf37b553f10326179e977e68d2e1757c Mon Sep 17 00:00:00 2001 From: Mazdak Farrokhzad Date: Mon, 9 Apr 2018 10:58:19 +0200 Subject: [PATCH 7/7] RFC 2295 --- text/{0000-os-str-pattern.md => 2295-os-str-pattern.md} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename text/{0000-os-str-pattern.md => 2295-os-str-pattern.md} (99%) diff --git a/text/0000-os-str-pattern.md b/text/2295-os-str-pattern.md similarity index 99% rename from text/0000-os-str-pattern.md rename to text/2295-os-str-pattern.md index 493de0a654a..aa830e462cd 100644 --- a/text/0000-os-str-pattern.md +++ b/text/2295-os-str-pattern.md @@ -1,7 +1,7 @@ - Feature Name: `os_str_pattern` - Start Date: 2018-01-16 -- RFC PR: (leave this empty) -- Rust Issue: (leave this empty) +- RFC PR: [rust-lang/rfcs#2295](https://github.com/rust-lang/rfcs/pull/2295) +- Rust Issue: [rust-lang/rust#49802](https://github.com/rust-lang/rust/issues/49802) # Summary [summary]: #summary