From 2bf0df777b7712d1b719cd5ac7cce63176b7384c Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 13 Dec 2017 01:02:19 -0600 Subject: [PATCH 01/14] Move rust memchr impl to libcore --- src/libcore/slice/memchr.rs | 224 ++++++++++++++++++++++++++++++ src/libcore/slice/mod.rs | 5 + src/libstd/lib.rs | 1 + src/libstd/sys/redox/memchr.rs | 2 +- src/libstd/sys/unix/memchr.rs | 2 +- src/libstd/sys/wasm/memchr.rs | 2 +- src/libstd/sys/windows/memchr.rs | 2 +- src/libstd/sys_common/memchr.rs | 227 ------------------------------- src/libstd/sys_common/mod.rs | 1 - 9 files changed, 234 insertions(+), 232 deletions(-) create mode 100644 src/libcore/slice/memchr.rs delete mode 100644 src/libstd/sys_common/memchr.rs diff --git a/src/libcore/slice/memchr.rs b/src/libcore/slice/memchr.rs new file mode 100644 index 0000000000000..252a258c30456 --- /dev/null +++ b/src/libcore/slice/memchr.rs @@ -0,0 +1,224 @@ +// Copyright 2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// Original implementation taken from rust-memchr +// Copyright 2015 Andrew Gallant, bluss and Nicolas Koch + +use cmp; +use mem; + +const LO_U64: u64 = 0x0101010101010101; +const HI_U64: u64 = 0x8080808080808080; + +// use truncation +const LO_USIZE: usize = LO_U64 as usize; +const HI_USIZE: usize = HI_U64 as usize; + +/// Return `true` if `x` contains any zero byte. +/// +/// From *Matters Computational*, J. Arndt +/// +/// "The idea is to subtract one from each of the bytes and then look for +/// bytes where the borrow propagated all the way to the most significant +/// bit." 
+#[inline] +fn contains_zero_byte(x: usize) -> bool { + x.wrapping_sub(LO_USIZE) & !x & HI_USIZE != 0 +} + +#[cfg(target_pointer_width = "32")] +#[inline] +fn repeat_byte(b: u8) -> usize { + let mut rep = (b as usize) << 8 | b as usize; + rep = rep << 16 | rep; + rep +} + +#[cfg(target_pointer_width = "64")] +#[inline] +fn repeat_byte(b: u8) -> usize { + let mut rep = (b as usize) << 8 | b as usize; + rep = rep << 16 | rep; + rep = rep << 32 | rep; + rep +} + +/// Return the first index matching the byte `a` in `text`. +pub fn memchr(x: u8, text: &[u8]) -> Option { + // Scan for a single byte value by reading two `usize` words at a time. + // + // Split `text` in three parts + // - unaligned initial part, before the first word aligned address in text + // - body, scan by 2 words at a time + // - the last remaining part, < 2 word size + let len = text.len(); + let ptr = text.as_ptr(); + let usize_bytes = mem::size_of::(); + + // search up to an aligned boundary + let mut offset = ptr.align_offset(usize_bytes); + if offset > 0 { + offset = cmp::min(offset, len); + if let Some(index) = text[..offset].iter().position(|elt| *elt == x) { + return Some(index); + } + } + + // search the body of the text + let repeated_x = repeat_byte(x); + + if len >= 2 * usize_bytes { + while offset <= len - 2 * usize_bytes { + unsafe { + let u = *(ptr.offset(offset as isize) as *const usize); + let v = *(ptr.offset((offset + usize_bytes) as isize) as *const usize); + + // break if there is a matching byte + let zu = contains_zero_byte(u ^ repeated_x); + let zv = contains_zero_byte(v ^ repeated_x); + if zu || zv { + break; + } + } + offset += usize_bytes * 2; + } + } + + // find the byte after the point the body loop stopped + text[offset..].iter().position(|elt| *elt == x).map(|i| offset + i) +} + +/// Return the last index matching the byte `a` in `text`. +pub fn memrchr(x: u8, text: &[u8]) -> Option { + // Scan for a single byte value by reading two `usize` words at a time. 
+ // + // Split `text` in three parts + // - unaligned tail, after the last word aligned address in text + // - body, scan by 2 words at a time + // - the first remaining bytes, < 2 word size + let len = text.len(); + let ptr = text.as_ptr(); + let usize_bytes = mem::size_of::(); + + // search to an aligned boundary + let end_align = (ptr as usize + len) & (usize_bytes - 1); + let mut offset; + if end_align > 0 { + offset = if end_align >= len { 0 } else { len - end_align }; + if let Some(index) = text[offset..].iter().rposition(|elt| *elt == x) { + return Some(offset + index); + } + } else { + offset = len; + } + + // search the body of the text + let repeated_x = repeat_byte(x); + + while offset >= 2 * usize_bytes { + unsafe { + let u = *(ptr.offset(offset as isize - 2 * usize_bytes as isize) as *const usize); + let v = *(ptr.offset(offset as isize - usize_bytes as isize) as *const usize); + + // break if there is a matching byte + let zu = contains_zero_byte(u ^ repeated_x); + let zv = contains_zero_byte(v ^ repeated_x); + if zu || zv { + break; + } + } + offset -= 2 * usize_bytes; + } + + // find the byte before the point the body loop stopped + text[..offset].iter().rposition(|elt| *elt == x) +} + +// test fallback implementations on all platforms +#[test] +fn matches_one() { + assert_eq!(Some(0), memchr(b'a', b"a")); +} + +#[test] +fn matches_begin() { + assert_eq!(Some(0), memchr(b'a', b"aaaa")); +} + +#[test] +fn matches_end() { + assert_eq!(Some(4), memchr(b'z', b"aaaaz")); +} + +#[test] +fn matches_nul() { + assert_eq!(Some(4), memchr(b'\x00', b"aaaa\x00")); +} + +#[test] +fn matches_past_nul() { + assert_eq!(Some(5), memchr(b'z', b"aaaa\x00z")); +} + +#[test] +fn no_match_empty() { + assert_eq!(None, memchr(b'a', b"")); +} + +#[test] +fn no_match() { + assert_eq!(None, memchr(b'a', b"xyz")); +} + +#[test] +fn matches_one_reversed() { + assert_eq!(Some(0), memrchr(b'a', b"a")); +} + +#[test] +fn matches_begin_reversed() { + assert_eq!(Some(3), 
memrchr(b'a', b"aaaa")); +} + +#[test] +fn matches_end_reversed() { + assert_eq!(Some(0), memrchr(b'z', b"zaaaa")); +} + +#[test] +fn matches_nul_reversed() { + assert_eq!(Some(4), memrchr(b'\x00', b"aaaa\x00")); +} + +#[test] +fn matches_past_nul_reversed() { + assert_eq!(Some(0), memrchr(b'z', b"z\x00aaaa")); +} + +#[test] +fn no_match_empty_reversed() { + assert_eq!(None, memrchr(b'a', b"")); +} + +#[test] +fn no_match_reversed() { + assert_eq!(None, memrchr(b'a', b"xyz")); +} + +#[test] +fn each_alignment_reversed() { + let mut data = [1u8; 64]; + let needle = 2; + let pos = 40; + data[pos] = needle; + for start in 0..16 { + assert_eq!(Some(pos - start), memrchr(needle, &data[start..])); + } +} diff --git a/src/libcore/slice/mod.rs b/src/libcore/slice/mod.rs index 49c51f4f04fdc..e4da1b7e5f5d8 100644 --- a/src/libcore/slice/mod.rs +++ b/src/libcore/slice/mod.rs @@ -50,6 +50,11 @@ use mem; use marker::{Copy, Send, Sync, Sized, self}; use iter_private::TrustedRandomAccess; +#[unstable(feature = "slice_internals", issue = "0", + reason = "exposed from core to be reused in std; use the memchr crate")] +/// Pure rust memchr implementation, taken from rust-memchr +pub mod memchr; + mod rotate; mod sort; diff --git a/src/libstd/lib.rs b/src/libstd/lib.rs index 12e6231136e16..536757336cd8b 100644 --- a/src/libstd/lib.rs +++ b/src/libstd/lib.rs @@ -302,6 +302,7 @@ #![feature(sip_hash_13)] #![feature(slice_bytes)] #![feature(slice_concat_ext)] +#![feature(slice_internals)] #![feature(slice_patterns)] #![feature(staged_api)] #![feature(stmt_expr_attributes)] diff --git a/src/libstd/sys/redox/memchr.rs b/src/libstd/sys/redox/memchr.rs index 4c314b7a47258..873b33535025b 100644 --- a/src/libstd/sys/redox/memchr.rs +++ b/src/libstd/sys/redox/memchr.rs @@ -11,4 +11,4 @@ // Original implementation taken from rust-memchr // Copyright 2015 Andrew Gallant, bluss and Nicolas Koch -pub use sys_common::memchr::fallback::{memchr, memrchr}; +pub use core::slice::memchr::{memchr, 
memrchr}; diff --git a/src/libstd/sys/unix/memchr.rs b/src/libstd/sys/unix/memchr.rs index aed04703ea117..f49adc24163ca 100644 --- a/src/libstd/sys/unix/memchr.rs +++ b/src/libstd/sys/unix/memchr.rs @@ -50,7 +50,7 @@ pub fn memrchr(needle: u8, haystack: &[u8]) -> Option { #[cfg(not(target_os = "linux"))] fn memrchr_specific(needle: u8, haystack: &[u8]) -> Option { - ::sys_common::memchr::fallback::memrchr(needle, haystack) + ::core::slice::memchr::memrchr(needle, haystack) } memrchr_specific(needle, haystack) diff --git a/src/libstd/sys/wasm/memchr.rs b/src/libstd/sys/wasm/memchr.rs index e611d94af30b1..964e35994139b 100644 --- a/src/libstd/sys/wasm/memchr.rs +++ b/src/libstd/sys/wasm/memchr.rs @@ -8,4 +8,4 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -pub use sys_common::memchr::fallback::{memchr, memrchr}; +pub use core::slice::memchr::{memchr, memrchr}; diff --git a/src/libstd/sys/windows/memchr.rs b/src/libstd/sys/windows/memchr.rs index 5a5386acaa531..fa7c816fd02ea 100644 --- a/src/libstd/sys/windows/memchr.rs +++ b/src/libstd/sys/windows/memchr.rs @@ -12,4 +12,4 @@ // Copyright 2015 Andrew Gallant, bluss and Nicolas Koch // Fallback memchr is fastest on windows -pub use sys_common::memchr::fallback::{memchr, memrchr}; +pub use core::slice::memchr::{memchr, memrchr}; diff --git a/src/libstd/sys_common/memchr.rs b/src/libstd/sys_common/memchr.rs deleted file mode 100644 index 50f998eb4867d..0000000000000 --- a/src/libstd/sys_common/memchr.rs +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright 2015 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. 
-// -// Original implementation taken from rust-memchr -// Copyright 2015 Andrew Gallant, bluss and Nicolas Koch - -#[allow(dead_code)] -pub mod fallback { - use cmp; - use mem; - - const LO_U64: u64 = 0x0101010101010101; - const HI_U64: u64 = 0x8080808080808080; - - // use truncation - const LO_USIZE: usize = LO_U64 as usize; - const HI_USIZE: usize = HI_U64 as usize; - - /// Return `true` if `x` contains any zero byte. - /// - /// From *Matters Computational*, J. Arndt - /// - /// "The idea is to subtract one from each of the bytes and then look for - /// bytes where the borrow propagated all the way to the most significant - /// bit." - #[inline] - fn contains_zero_byte(x: usize) -> bool { - x.wrapping_sub(LO_USIZE) & !x & HI_USIZE != 0 - } - - #[cfg(target_pointer_width = "32")] - #[inline] - fn repeat_byte(b: u8) -> usize { - let mut rep = (b as usize) << 8 | b as usize; - rep = rep << 16 | rep; - rep - } - - #[cfg(target_pointer_width = "64")] - #[inline] - fn repeat_byte(b: u8) -> usize { - let mut rep = (b as usize) << 8 | b as usize; - rep = rep << 16 | rep; - rep = rep << 32 | rep; - rep - } - - /// Return the first index matching the byte `a` in `text`. - pub fn memchr(x: u8, text: &[u8]) -> Option { - // Scan for a single byte value by reading two `usize` words at a time. 
- // - // Split `text` in three parts - // - unaligned initial part, before the first word aligned address in text - // - body, scan by 2 words at a time - // - the last remaining part, < 2 word size - let len = text.len(); - let ptr = text.as_ptr(); - let usize_bytes = mem::size_of::(); - - // search up to an aligned boundary - let mut offset = ptr.align_offset(usize_bytes); - if offset > 0 { - offset = cmp::min(offset, len); - if let Some(index) = text[..offset].iter().position(|elt| *elt == x) { - return Some(index); - } - } - - // search the body of the text - let repeated_x = repeat_byte(x); - - if len >= 2 * usize_bytes { - while offset <= len - 2 * usize_bytes { - unsafe { - let u = *(ptr.offset(offset as isize) as *const usize); - let v = *(ptr.offset((offset + usize_bytes) as isize) as *const usize); - - // break if there is a matching byte - let zu = contains_zero_byte(u ^ repeated_x); - let zv = contains_zero_byte(v ^ repeated_x); - if zu || zv { - break; - } - } - offset += usize_bytes * 2; - } - } - - // find the byte after the point the body loop stopped - text[offset..].iter().position(|elt| *elt == x).map(|i| offset + i) - } - - /// Return the last index matching the byte `a` in `text`. - pub fn memrchr(x: u8, text: &[u8]) -> Option { - // Scan for a single byte value by reading two `usize` words at a time. 
- // - // Split `text` in three parts - // - unaligned tail, after the last word aligned address in text - // - body, scan by 2 words at a time - // - the first remaining bytes, < 2 word size - let len = text.len(); - let ptr = text.as_ptr(); - let usize_bytes = mem::size_of::(); - - // search to an aligned boundary - let end_align = (ptr as usize + len) & (usize_bytes - 1); - let mut offset; - if end_align > 0 { - offset = if end_align >= len { 0 } else { len - end_align }; - if let Some(index) = text[offset..].iter().rposition(|elt| *elt == x) { - return Some(offset + index); - } - } else { - offset = len; - } - - // search the body of the text - let repeated_x = repeat_byte(x); - - while offset >= 2 * usize_bytes { - unsafe { - let u = *(ptr.offset(offset as isize - 2 * usize_bytes as isize) as *const usize); - let v = *(ptr.offset(offset as isize - usize_bytes as isize) as *const usize); - - // break if there is a matching byte - let zu = contains_zero_byte(u ^ repeated_x); - let zv = contains_zero_byte(v ^ repeated_x); - if zu || zv { - break; - } - } - offset -= 2 * usize_bytes; - } - - // find the byte before the point the body loop stopped - text[..offset].iter().rposition(|elt| *elt == x) - } - - // test fallback implementations on all platforms - #[test] - fn matches_one() { - assert_eq!(Some(0), memchr(b'a', b"a")); - } - - #[test] - fn matches_begin() { - assert_eq!(Some(0), memchr(b'a', b"aaaa")); - } - - #[test] - fn matches_end() { - assert_eq!(Some(4), memchr(b'z', b"aaaaz")); - } - - #[test] - fn matches_nul() { - assert_eq!(Some(4), memchr(b'\x00', b"aaaa\x00")); - } - - #[test] - fn matches_past_nul() { - assert_eq!(Some(5), memchr(b'z', b"aaaa\x00z")); - } - - #[test] - fn no_match_empty() { - assert_eq!(None, memchr(b'a', b"")); - } - - #[test] - fn no_match() { - assert_eq!(None, memchr(b'a', b"xyz")); - } - - #[test] - fn matches_one_reversed() { - assert_eq!(Some(0), memrchr(b'a', b"a")); - } - - #[test] - fn matches_begin_reversed() { - 
assert_eq!(Some(3), memrchr(b'a', b"aaaa")); - } - - #[test] - fn matches_end_reversed() { - assert_eq!(Some(0), memrchr(b'z', b"zaaaa")); - } - - #[test] - fn matches_nul_reversed() { - assert_eq!(Some(4), memrchr(b'\x00', b"aaaa\x00")); - } - - #[test] - fn matches_past_nul_reversed() { - assert_eq!(Some(0), memrchr(b'z', b"z\x00aaaa")); - } - - #[test] - fn no_match_empty_reversed() { - assert_eq!(None, memrchr(b'a', b"")); - } - - #[test] - fn no_match_reversed() { - assert_eq!(None, memrchr(b'a', b"xyz")); - } - - #[test] - fn each_alignment_reversed() { - let mut data = [1u8; 64]; - let needle = 2; - let pos = 40; - data[pos] = needle; - for start in 0..16 { - assert_eq!(Some(pos - start), memrchr(needle, &data[start..])); - } - } -} diff --git a/src/libstd/sys_common/mod.rs b/src/libstd/sys_common/mod.rs index 14e5697b94e57..534fcf4d11bbb 100644 --- a/src/libstd/sys_common/mod.rs +++ b/src/libstd/sys_common/mod.rs @@ -33,7 +33,6 @@ pub mod at_exit_imp; pub mod backtrace; pub mod condvar; pub mod io; -pub mod memchr; pub mod mutex; pub mod poison; pub mod remutex; From f8f28886e0d98c9cbd6cb3a719f9014960ec1d24 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 13 Dec 2017 09:11:42 -0600 Subject: [PATCH 02/14] Use memchr in [u8]::contains --- src/libcore/slice/mod.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/libcore/slice/mod.rs b/src/libcore/slice/mod.rs index e4da1b7e5f5d8..346ee27331121 100644 --- a/src/libcore/slice/mod.rs +++ b/src/libcore/slice/mod.rs @@ -624,7 +624,7 @@ impl SliceExt for [T] { #[inline] fn contains(&self, x: &T) -> bool where T: PartialEq { - self.iter().any(|elt| *x == *elt) + x.slice_contains(self) } #[inline] @@ -2619,3 +2619,19 @@ unsafe impl<'a, T> TrustedRandomAccess for IterMut<'a, T> { } fn may_have_side_effect() -> bool { false } } + +trait SliceContains: Sized { + fn slice_contains(&self, x: &[Self]) -> bool; +} + +impl SliceContains for T where T: PartialEq { + default fn 
slice_contains(&self, x: &[Self]) -> bool { + x.iter().any(|y| *y == *self) + } +} + +impl SliceContains for u8 { + fn slice_contains(&self, x: &[Self]) -> bool { + memchr::memchr(*self, x).is_some() + } +} From 1d818a4d8c3fa5b15ad2e2ab30531316565d556c Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 13 Dec 2017 10:40:11 -0600 Subject: [PATCH 03/14] Support 16 bit platforms --- src/libcore/slice/memchr.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/libcore/slice/memchr.rs b/src/libcore/slice/memchr.rs index 252a258c30456..00183be97e751 100644 --- a/src/libcore/slice/memchr.rs +++ b/src/libcore/slice/memchr.rs @@ -33,6 +33,12 @@ fn contains_zero_byte(x: usize) -> bool { x.wrapping_sub(LO_USIZE) & !x & HI_USIZE != 0 } +#[cfg(target_pointer_width = "16")] +#[inline] +fn repeat_byte(b: u8) -> usize { + (b as usize) << 8 | b as usize +} + #[cfg(target_pointer_width = "32")] #[inline] fn repeat_byte(b: u8) -> usize { From 4550ea79f004215af1490e2c269a16d46b890b9f Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 13 Dec 2017 13:11:48 -0600 Subject: [PATCH 04/14] Remove the unused ascii_only field in CharEqSearcher --- src/libcore/str/pattern.rs | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index edb7bed4520fb..3200cfc498236 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -241,23 +241,16 @@ pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} #[doc(hidden)] trait CharEq { fn matches(&mut self, c: char) -> bool; - fn only_ascii(&self) -> bool; } impl CharEq for char { #[inline] fn matches(&mut self, c: char) -> bool { *self == c } - - #[inline] - fn only_ascii(&self) -> bool { (*self as u32) < 128 } } impl CharEq for F where F: FnMut(char) -> bool { #[inline] fn matches(&mut self, c: char) -> bool { (*self)(c) } - - #[inline] - fn only_ascii(&self) -> bool { false } } impl<'a> CharEq for &'a [char] { @@ -265,11 +258,6 @@ impl<'a> 
CharEq for &'a [char] { fn matches(&mut self, c: char) -> bool { self.iter().any(|&m| { let mut m = m; m.matches(c) }) } - - #[inline] - fn only_ascii(&self) -> bool { - self.iter().all(|m| m.only_ascii()) - } } struct CharEqPattern(C); @@ -279,8 +267,6 @@ struct CharEqSearcher<'a, C: CharEq> { char_eq: C, haystack: &'a str, char_indices: super::CharIndices<'a>, - #[allow(dead_code)] - ascii_only: bool, } impl<'a, C: CharEq> Pattern<'a> for CharEqPattern { @@ -289,7 +275,6 @@ impl<'a, C: CharEq> Pattern<'a> for CharEqPattern { #[inline] fn into_searcher(self, haystack: &'a str) -> CharEqSearcher<'a, C> { CharEqSearcher { - ascii_only: self.0.only_ascii(), haystack, char_eq: self.0, char_indices: haystack.char_indices(), @@ -499,7 +484,6 @@ impl<'a, F> fmt::Debug for CharPredicateSearcher<'a, F> f.debug_struct("CharPredicateSearcher") .field("haystack", &self.0.haystack) .field("char_indices", &self.0.char_indices) - .field("ascii_only", &self.0.ascii_only) .finish() } } From 72cab5e3263343502aeb1f21a8a17c7f7e917a50 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 13 Dec 2017 14:36:49 -0600 Subject: [PATCH 05/14] Split out char searcher from MultiCharSearcher --- src/libcore/str/pattern.rs | 87 +++++++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 29 deletions(-) diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 3200cfc498236..9dc828518278e 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -235,46 +235,41 @@ pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} ///////////////////////////////////////////////////////////////////////////// -// Impl for a CharEq wrapper +// Impl for a MultiCharEq wrapper ///////////////////////////////////////////////////////////////////////////// #[doc(hidden)] -trait CharEq { +trait MultiCharEq { fn matches(&mut self, c: char) -> bool; } -impl CharEq for char { - #[inline] - fn matches(&mut self, 
c: char) -> bool { *self == c } -} - -impl CharEq for F where F: FnMut(char) -> bool { +impl MultiCharEq for F where F: FnMut(char) -> bool { #[inline] fn matches(&mut self, c: char) -> bool { (*self)(c) } } -impl<'a> CharEq for &'a [char] { +impl<'a> MultiCharEq for &'a [char] { #[inline] fn matches(&mut self, c: char) -> bool { - self.iter().any(|&m| { let mut m = m; m.matches(c) }) + self.iter().any(|&m| { m == c }) } } -struct CharEqPattern(C); +struct MultiCharEqPattern(C); #[derive(Clone, Debug)] -struct CharEqSearcher<'a, C: CharEq> { +struct MultiCharEqSearcher<'a, C: MultiCharEq> { char_eq: C, haystack: &'a str, char_indices: super::CharIndices<'a>, } -impl<'a, C: CharEq> Pattern<'a> for CharEqPattern { - type Searcher = CharEqSearcher<'a, C>; +impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern { + type Searcher = MultiCharEqSearcher<'a, C>; #[inline] - fn into_searcher(self, haystack: &'a str) -> CharEqSearcher<'a, C> { - CharEqSearcher { + fn into_searcher(self, haystack: &'a str) -> MultiCharEqSearcher<'a, C> { + MultiCharEqSearcher { haystack, char_eq: self.0, char_indices: haystack.char_indices(), @@ -282,7 +277,7 @@ impl<'a, C: CharEq> Pattern<'a> for CharEqPattern { } } -unsafe impl<'a, C: CharEq> Searcher<'a> for CharEqSearcher<'a, C> { +unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> { #[inline] fn haystack(&self) -> &'a str { self.haystack @@ -307,7 +302,7 @@ unsafe impl<'a, C: CharEq> Searcher<'a> for CharEqSearcher<'a, C> { } } -unsafe impl<'a, C: CharEq> ReverseSearcher<'a> for CharEqSearcher<'a, C> { +unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, C> { #[inline] fn next_back(&mut self) -> SearchStep { let s = &mut self.char_indices; @@ -327,7 +322,7 @@ unsafe impl<'a, C: CharEq> ReverseSearcher<'a> for CharEqSearcher<'a, C> { } } -impl<'a, C: CharEq> DoubleEndedSearcher<'a> for CharEqSearcher<'a, C> {} +impl<'a, C: MultiCharEq> DoubleEndedSearcher<'a> for 
MultiCharEqSearcher<'a, C> {} ///////////////////////////////////////////////////////////////////////////// @@ -400,14 +395,40 @@ macro_rules! searcher_methods { /// Associated type for `>::Searcher`. #[derive(Clone, Debug)] -pub struct CharSearcher<'a>( as Pattern<'a>>::Searcher); +pub struct CharSearcher<'a>(&'a str); unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { - searcher_methods!(forward); + #[inline] + fn haystack(&self) -> &'a str { + unimplemented!(); + } + #[inline] + fn next(&mut self) -> SearchStep { + unimplemented!(); + } + #[inline] + fn next_match(&mut self) -> Option<(usize, usize)> { + unimplemented!(); + } + #[inline] + fn next_reject(&mut self) -> Option<(usize, usize)> { + unimplemented!(); + } } unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { - searcher_methods!(reverse); + #[inline] + fn next_back(&mut self) -> SearchStep { + unimplemented!(); + } + #[inline] + fn next_match_back(&mut self) -> Option<(usize, usize)> { + unimplemented!(); + } + #[inline] + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + unimplemented!(); + } } impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} @@ -418,7 +439,7 @@ impl<'a> Pattern<'a> for char { #[inline] fn into_searcher(self, haystack: &'a str) -> Self::Searcher { - CharSearcher(CharEqPattern(self).into_searcher(haystack)) + CharSearcher(haystack) } #[inline] @@ -433,13 +454,21 @@ impl<'a> Pattern<'a> for char { #[inline] fn is_prefix_of(self, haystack: &'a str) -> bool { - CharEqPattern(self).is_prefix_of(haystack) + if let Some(ch) = haystack.chars().next() { + self == ch + } else { + false + } } #[inline] fn is_suffix_of(self, haystack: &'a str) -> bool where Self::Searcher: ReverseSearcher<'a> { - CharEqPattern(self).is_suffix_of(haystack) + if let Some(ch) = haystack.chars().next_back() { + self == ch + } else { + false + } } } @@ -451,7 +480,7 @@ impl<'a> Pattern<'a> for char { /// Associated type for `<&[char] as Pattern<'a>>::Searcher`. 
#[derive(Clone, Debug)] -pub struct CharSliceSearcher<'a, 'b>( as Pattern<'a>>::Searcher); +pub struct CharSliceSearcher<'a, 'b>( as Pattern<'a>>::Searcher); unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> { searcher_methods!(forward); @@ -465,7 +494,7 @@ impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {} /// Searches for chars that are equal to any of the chars in the array impl<'a, 'b> Pattern<'a> for &'b [char] { - pattern_methods!(CharSliceSearcher<'a, 'b>, CharEqPattern, CharSliceSearcher); + pattern_methods!(CharSliceSearcher<'a, 'b>, MultiCharEqPattern, CharSliceSearcher); } ///////////////////////////////////////////////////////////////////////////// @@ -474,7 +503,7 @@ impl<'a, 'b> Pattern<'a> for &'b [char] { /// Associated type for `>::Searcher`. #[derive(Clone)] -pub struct CharPredicateSearcher<'a, F>( as Pattern<'a>>::Searcher) +pub struct CharPredicateSearcher<'a, F>( as Pattern<'a>>::Searcher) where F: FnMut(char) -> bool; impl<'a, F> fmt::Debug for CharPredicateSearcher<'a, F> @@ -504,7 +533,7 @@ impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F> /// Searches for chars that match the given predicate impl<'a, F> Pattern<'a> for F where F: FnMut(char) -> bool { - pattern_methods!(CharPredicateSearcher<'a, F>, CharEqPattern, CharPredicateSearcher); + pattern_methods!(CharPredicateSearcher<'a, F>, MultiCharEqPattern, CharPredicateSearcher); } ///////////////////////////////////////////////////////////////////////////// From 585ad9ff30e579e929bca2b1221367cc440aa377 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 13 Dec 2017 14:37:35 -0600 Subject: [PATCH 06/14] Move CharSearcher to its own section in the file --- src/libcore/str/pattern.rs | 167 +++++++++++++++++++------------------ 1 file changed, 84 insertions(+), 83 deletions(-) diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 9dc828518278e..b1b66c9f8d8b3 100644 --- a/src/libcore/str/pattern.rs +++ 
b/src/libcore/str/pattern.rs @@ -234,6 +234,90 @@ pub unsafe trait ReverseSearcher<'a>: Searcher<'a> { /// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched. pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} + +///////////////////////////////////////////////////////////////////////////// +// Impl for char +///////////////////////////////////////////////////////////////////////////// + +/// Associated type for `>::Searcher`. +#[derive(Clone, Debug)] +pub struct CharSearcher<'a>(&'a str); + +unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { + #[inline] + fn haystack(&self) -> &'a str { + unimplemented!(); + } + #[inline] + fn next(&mut self) -> SearchStep { + unimplemented!(); + } + #[inline] + fn next_match(&mut self) -> Option<(usize, usize)> { + unimplemented!(); + } + #[inline] + fn next_reject(&mut self) -> Option<(usize, usize)> { + unimplemented!(); + } +} + +unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { + #[inline] + fn next_back(&mut self) -> SearchStep { + unimplemented!(); + } + #[inline] + fn next_match_back(&mut self) -> Option<(usize, usize)> { + unimplemented!(); + } + #[inline] + fn next_reject_back(&mut self) -> Option<(usize, usize)> { + unimplemented!(); + } +} + +impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} + +/// Searches for chars that are equal to a given char +impl<'a> Pattern<'a> for char { + type Searcher = CharSearcher<'a>; + + #[inline] + fn into_searcher(self, haystack: &'a str) -> Self::Searcher { + CharSearcher(haystack) + } + + #[inline] + fn is_contained_in(self, haystack: &'a str) -> bool { + if (self as u32) < 128 { + haystack.as_bytes().contains(&(self as u8)) + } else { + let mut buffer = [0u8; 4]; + self.encode_utf8(&mut buffer).is_contained_in(haystack) + } + } + + #[inline] + fn is_prefix_of(self, haystack: &'a str) -> bool { + if let Some(ch) = haystack.chars().next() { + self == ch + } else { + false + } + } + + #[inline] + fn is_suffix_of(self, haystack: &'a str) -> bool 
where Self::Searcher: ReverseSearcher<'a> + { + if let Some(ch) = haystack.chars().next_back() { + self == ch + } else { + false + } + } +} + ///////////////////////////////////////////////////////////////////////////// // Impl for a MultiCharEq wrapper ///////////////////////////////////////////////////////////////////////////// @@ -389,89 +473,6 @@ macro_rules! searcher_methods { } } -///////////////////////////////////////////////////////////////////////////// -// Impl for char -///////////////////////////////////////////////////////////////////////////// - -/// Associated type for `>::Searcher`. -#[derive(Clone, Debug)] -pub struct CharSearcher<'a>(&'a str); - -unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { - #[inline] - fn haystack(&self) -> &'a str { - unimplemented!(); - } - #[inline] - fn next(&mut self) -> SearchStep { - unimplemented!(); - } - #[inline] - fn next_match(&mut self) -> Option<(usize, usize)> { - unimplemented!(); - } - #[inline] - fn next_reject(&mut self) -> Option<(usize, usize)> { - unimplemented!(); - } -} - -unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { - #[inline] - fn next_back(&mut self) -> SearchStep { - unimplemented!(); - } - #[inline] - fn next_match_back(&mut self) -> Option<(usize, usize)> { - unimplemented!(); - } - #[inline] - fn next_reject_back(&mut self) -> Option<(usize, usize)> { - unimplemented!(); - } -} - -impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} - -/// Searches for chars that are equal to a given char -impl<'a> Pattern<'a> for char { - type Searcher = CharSearcher<'a>; - - #[inline] - fn into_searcher(self, haystack: &'a str) -> Self::Searcher { - CharSearcher(haystack) - } - - #[inline] - fn is_contained_in(self, haystack: &'a str) -> bool { - if (self as u32) < 128 { - haystack.as_bytes().contains(&(self as u8)) - } else { - let mut buffer = [0u8; 4]; - self.encode_utf8(&mut buffer).is_contained_in(haystack) - } - } - - #[inline] - fn is_prefix_of(self, haystack: &'a str) -> bool { 
- if let Some(ch) = haystack.chars().next() { - self == ch - } else { - false - } - } - - #[inline] - fn is_suffix_of(self, haystack: &'a str) -> bool where Self::Searcher: ReverseSearcher<'a> - { - if let Some(ch) = haystack.chars().next_back() { - self == ch - } else { - false - } - } -} - ///////////////////////////////////////////////////////////////////////////// // Impl for &[char] ///////////////////////////////////////////////////////////////////////////// From d9dc44a5e9857864905e1cdbf40ab9ac617f65e7 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 13 Dec 2017 15:26:27 -0600 Subject: [PATCH 07/14] Fill in forward searcher impl for char --- src/libcore/str/pattern.rs | 78 +++++++++++++++++++++++++++++++------- 1 file changed, 65 insertions(+), 13 deletions(-) diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index b1b66c9f8d8b3..3f24374223cb6 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -19,6 +19,7 @@ use cmp; use fmt; +use slice::memchr; use usize; // Pattern @@ -241,25 +242,66 @@ pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} /// Associated type for `>::Searcher`. #[derive(Clone, Debug)] -pub struct CharSearcher<'a>(&'a str); +pub struct CharSearcher<'a> { + haystack: &'a str, + // invariant: `finger` must be a valid utf8 byte index of `haystack` + finger: usize, + needle: char, + // For ascii chars + // invariant: must be an ASCII byte (no high bit) + single_byte: Option, +} unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { #[inline] fn haystack(&self) -> &'a str { - unimplemented!(); + self.haystack } #[inline] fn next(&mut self) -> SearchStep { - unimplemented!(); + let old_finger = self.finger; + let slice = unsafe { self.haystack.get_unchecked(old_finger..) 
}; + let mut iter = slice.chars(); + let old_len = iter.iter.len(); + if let Some(ch) = iter.next() { + // add byte offset of current character + // without recalculating + self.finger += iter.iter.len() - old_len; + if ch == self.needle { + SearchStep::Match(old_finger, self.finger) + } else { + SearchStep::Reject(old_finger, self.finger) + } + } else { + SearchStep::Done + } } #[inline] fn next_match(&mut self) -> Option<(usize, usize)> { - unimplemented!(); - } - #[inline] - fn next_reject(&mut self) -> Option<(usize, usize)> { - unimplemented!(); + if let Some(byte) = self.single_byte { + let old_finger = self.finger; + let slice = unsafe { self.haystack.get_unchecked(old_finger..) }; + let bytes = slice.as_bytes(); + if let Some(index) = memchr::memchr(byte, bytes) { + // index is the index of a valid ASCII byte, + // so we can add one to it + self.finger += index + 1; + Some((index, self.finger)) + } else { + None + } + } else { + loop { + match self.next() { + SearchStep::Match(a, b) => break Some((a, b)), + SearchStep::Done => break None, + _ => continue, + } + } + } } + + // let next_reject use the default implementation from the Searcher trait } unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { @@ -271,10 +313,8 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { fn next_match_back(&mut self) -> Option<(usize, usize)> { unimplemented!(); } - #[inline] - fn next_reject_back(&mut self) -> Option<(usize, usize)> { - unimplemented!(); - } + + // let next_reject_back use the default implementation from the Searcher trait } impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {} @@ -285,7 +325,19 @@ impl<'a> Pattern<'a> for char { #[inline] fn into_searcher(self, haystack: &'a str) -> Self::Searcher { - CharSearcher(haystack) + let single_byte = if self.len_utf8() == 1 { + let mut storage = [0]; + self.encode_utf8(&mut storage); + Some(storage[0]) + } else { + None + }; + CharSearcher { + haystack, + finger: 0, + needle: self, + single_byte, 
+ } } #[inline] From f865164030ccd167a9e9f9fae665373fb58295fb Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Thu, 14 Dec 2017 14:10:10 -0600 Subject: [PATCH 08/14] Fill in reverse searcher impl for char --- src/libcore/str/pattern.rs | 56 ++++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 3f24374223cb6..54e426893bc7e 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -128,6 +128,11 @@ pub unsafe trait Searcher<'a> { fn next(&mut self) -> SearchStep; /// Find the next `Match` result. See `next()` + /// + /// Unlike next(), there is no guarantee that the returned ranges + /// of this and next_reject will overlap. This will return (start_match, end_match), + /// where start_match is the index of where the match begins, and end_match is + /// the index after the end of the match. #[inline] fn next_match(&mut self) -> Option<(usize, usize)> { loop { @@ -139,7 +144,10 @@ pub unsafe trait Searcher<'a> { } } - /// Find the next `Reject` result. See `next()` + /// Find the next `Reject` result. See `next()` and `next_match()` + /// + /// Unlike next(), there is no guarantee that the returned ranges + /// of this and next_match will overlap. 
#[inline] fn next_reject(&mut self) -> Option<(usize, usize)> { loop { @@ -244,8 +252,9 @@ pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} #[derive(Clone, Debug)] pub struct CharSearcher<'a> { haystack: &'a str, - // invariant: `finger` must be a valid utf8 byte index of `haystack` + // invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack` finger: usize, + finger_back: usize, needle: char, // For ascii chars // invariant: must be an ASCII byte (no high bit) @@ -266,7 +275,7 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { if let Some(ch) = iter.next() { // add byte offset of current character // without recalculating - self.finger += iter.iter.len() - old_len; + self.finger += old_len - iter.iter.len(); if ch == self.needle { SearchStep::Match(old_finger, self.finger) } else { @@ -286,7 +295,7 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { // index is the index of a valid ASCII byte, // so we can add one to it self.finger += index + 1; - Some((index, self.finger)) + Some((self.finger - 1, self.finger)) } else { None } @@ -307,11 +316,45 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { #[inline] fn next_back(&mut self) -> SearchStep { - unimplemented!(); + let old_finger = self.finger_back; + let slice = unsafe { self.haystack.slice_unchecked(0, old_finger) }; + let mut iter = slice.chars(); + let old_len = iter.iter.len(); + if let Some(ch) = iter.next_back() { + // subtract byte offset of current character + // without recalculating + self.finger_back -= old_len - iter.iter.len(); + if ch == self.needle { + SearchStep::Match(self.finger_back, old_finger) + } else { + SearchStep::Reject(self.finger_back, old_finger) + } + } else { + SearchStep::Done + } } #[inline] fn next_match_back(&mut self) -> Option<(usize, usize)> { - unimplemented!(); + if let Some(byte) = self.single_byte { + let old_finger = self.finger_back; + let slice = unsafe { 
self.haystack.slice_unchecked(0, old_finger) }; + let bytes = slice.as_bytes(); + if let Some(index) = memchr::memrchr(byte, bytes) { + // index is the index of a valid ASCII byte + self.finger_back = index; + Some((self.finger_back, self.finger_back + 1)) + } else { + None + } + } else { + loop { + match self.next_back() { + SearchStep::Match(a, b) => break Some((a, b)), + SearchStep::Done => break None, + _ => continue, + } + } + } } // let next_reject_back use the default implementation from the Searcher trait @@ -335,6 +378,7 @@ impl<'a> Pattern<'a> for char { CharSearcher { haystack, finger: 0, + finger_back: haystack.len(), needle: self, single_byte, } From 75c07a37ff352607523a3c7a4e8bc3809949cb4c Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Sat, 16 Dec 2017 22:17:27 -0600 Subject: [PATCH 09/14] Add memchr search support for multibyte characters --- src/libcore/str/pattern.rs | 150 +++++++++++++++++++++++++------------ 1 file changed, 102 insertions(+), 48 deletions(-) diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 54e426893bc7e..e44799bb9c5ab 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -252,13 +252,28 @@ pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {} #[derive(Clone, Debug)] pub struct CharSearcher<'a> { haystack: &'a str, - // invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack` + // safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack` + // This invariant can be broken *within* next_match and next_match_back, however + // they must exit with fingers on valid code point boundaries. + + /// `finger` is the current byte index of the forward search. + /// Imagine that it exists before the byte at its index, i.e. + /// haystack[finger] is the first byte of the slice we must inspect during + /// forward searching finger: usize, + /// `finger_back` is the current byte index of the reverse search. 
+ /// Imagine that it exists after the byte at its index, i.e. + /// haystack[finger_back - 1] is the last byte of the slice we must inspect during + /// forward searching (and thus the first byte to be inspected when calling next_back()) finger_back: usize, + /// The character being searched for needle: char, - // For ascii chars - // invariant: must be an ASCII byte (no high bit) - single_byte: Option, + + // safety invariant: `utf8_size` must be less than 5 + /// The number of bytes `needle` takes up when encoded in utf8 + utf8_size: usize, + /// A utf8 encoded copy of the `needle` + utf8_encoded: [u8; 4], } unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { @@ -269,12 +284,12 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { #[inline] fn next(&mut self) -> SearchStep { let old_finger = self.finger; - let slice = unsafe { self.haystack.get_unchecked(old_finger..) }; + let slice = unsafe { self.haystack.get_unchecked(old_finger..self.haystack.len()) }; let mut iter = slice.chars(); let old_len = iter.iter.len(); if let Some(ch) = iter.next() { // add byte offset of current character - // without recalculating + // without re-encoding as utf-8 self.finger += old_len - iter.iter.len(); if ch == self.needle { SearchStep::Match(old_finger, self.finger) @@ -287,25 +302,44 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { } #[inline] fn next_match(&mut self) -> Option<(usize, usize)> { - if let Some(byte) = self.single_byte { - let old_finger = self.finger; - let slice = unsafe { self.haystack.get_unchecked(old_finger..) }; - let bytes = slice.as_bytes(); - if let Some(index) = memchr::memchr(byte, bytes) { - // index is the index of a valid ASCII byte, - // so we can add one to it - self.finger += index + 1; - Some((self.finger - 1, self.finger)) + loop { + // get the haystack after the last character found + let bytes = if let Some(slice) = self.haystack.as_bytes().get(self.finger..) 
{ + slice } else { - None - } - } else { - loop { - match self.next() { - SearchStep::Match(a, b) => break Some((a, b)), - SearchStep::Done => break None, - _ => continue, + return None; + }; + // the last byte of the utf8 encoded needle + let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) }; + if let Some(index) = memchr::memchr(last_byte, bytes) { + // The new finger is the index of the byte we found, + // plus one, since we memchr'd for the last byte of the character. + // + // Note that this doesn't always give us a finger on a UTF8 boundary. + // If we *didn't* find our character + // we may have indexed to the non-last byte of a 3-byte or 4-byte character. + // We can't just skip to the next valid starting byte because a character like + // ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find + // the second byte when searching for the third. + // + // However, this is totally okay. While we have the invariant that + // self.finger is on a UTF8 boundary, this invariant is not relied upon + // within this method (it is relied upon in CharSearcher::next()). + // + // We only exit this method when we reach the end of the string, or if we + // find something. When we find something the `finger` will be set + // to a UTF8 boundary.
+ self.finger += index + 1; + let found_char = self.finger - self.utf8_size; + if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) { + if slice == &self.utf8_encoded[0..self.utf8_size] { + return Some((found_char, self.finger)); + } } + } else { + // found nothing, exit + self.finger = self.haystack.len(); + return None; } } } @@ -322,7 +356,7 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { let old_len = iter.iter.len(); if let Some(ch) = iter.next_back() { // subtract byte offset of current character - // without recalculating + // without re-encoding as utf-8 self.finger_back -= old_len - iter.iter.len(); if ch == self.needle { SearchStep::Match(self.finger_back, old_finger) @@ -335,24 +369,47 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { } #[inline] fn next_match_back(&mut self) -> Option<(usize, usize)> { - if let Some(byte) = self.single_byte { - let old_finger = self.finger_back; - let slice = unsafe { self.haystack.slice_unchecked(0, old_finger) }; - let bytes = slice.as_bytes(); - if let Some(index) = memchr::memrchr(byte, bytes) { - // index is the index of a valid ASCII byte - self.finger_back = index; - Some((self.finger_back, self.finger_back + 1)) + let haystack = self.haystack.as_bytes(); + loop { + // get the haystack up to but not including the last character searched + let bytes = if let Some(slice) = haystack.get(..self.finger_back) { + slice } else { - None - } - } else { - loop { - match self.next_back() { - SearchStep::Match(a, b) => break Some((a, b)), - SearchStep::Done => break None, - _ => continue, + return None; + }; + // the last byte of the utf8 encoded needle + let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) }; + if let Some(index) = memchr::memrchr(last_byte, bytes) { + // memrchr will return the index of the byte we wish to + // find. 
In case of an ASCII character, this is indeed + // where we wish our new finger to be ("after" the found + // char in the paradigm of reverse iteration). For + // multibyte chars we need to skip down by the number of more + // bytes they have than ASCII + let found_char = index - (self.utf8_size - 1); + if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) { + if slice == &self.utf8_encoded[0..self.utf8_size] { + // move finger to before the character found (i.e. at its start index) + self.finger_back = found_char; + return Some((self.finger_back, self.finger_back + self.utf8_size)); + } } + // We can't use finger_back = index - size + 1 here. If we found the last char + // of a different-sized character (or the middle byte of a different character) + // we need to bump the finger_back down to `index`. This similarly makes + // `finger_back` have the potential to no longer be on a boundary, + // but this is OK since we only exit this function on a boundary + // or when the haystack has been searched completely. + // + // Unlike next_match this does not + // have the problem of repeated bytes in utf-8 because + // we're searching for the last byte, and we can only have + // found the last byte when searching in reverse.
+ self.finger_back = index; + } else { + self.finger_back = 0; + // found nothing, exit + return None; } } } @@ -368,19 +425,16 @@ impl<'a> Pattern<'a> for char { #[inline] fn into_searcher(self, haystack: &'a str) -> Self::Searcher { - let single_byte = if self.len_utf8() == 1 { - let mut storage = [0]; - self.encode_utf8(&mut storage); - Some(storage[0]) - } else { - None - }; + let mut utf8_encoded = [0; 4]; + self.encode_utf8(&mut utf8_encoded); + let utf8_size = self.len_utf8(); CharSearcher { haystack, finger: 0, finger_back: haystack.len(), needle: self, - single_byte, + utf8_size, + utf8_encoded } } From efcc447ebfafde91eba51ae04cdb8b0b776f8ac8 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Sun, 17 Dec 2017 14:44:03 -0800 Subject: [PATCH 10/14] Add simple test for pattern API --- src/libcore/tests/lib.rs | 2 + src/libcore/tests/pattern.rs | 76 ++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 src/libcore/tests/pattern.rs diff --git a/src/libcore/tests/lib.rs b/src/libcore/tests/lib.rs index 0e445cdac358a..c4b85b829812c 100644 --- a/src/libcore/tests/lib.rs +++ b/src/libcore/tests/lib.rs @@ -28,6 +28,7 @@ #![feature(iter_rfind)] #![feature(iter_rfold)] #![feature(nonzero)] +#![feature(pattern)] #![feature(raw)] #![feature(refcell_replace_swap)] #![feature(sip_hash_13)] @@ -61,6 +62,7 @@ mod nonzero; mod num; mod ops; mod option; +mod pattern; mod ptr; mod result; mod slice; diff --git a/src/libcore/tests/pattern.rs b/src/libcore/tests/pattern.rs new file mode 100644 index 0000000000000..e12f0bc9e5f6d --- /dev/null +++ b/src/libcore/tests/pattern.rs @@ -0,0 +1,76 @@ +use std::str::pattern::*; + +// This macro makes it easier to write +// tests that do a series of iterations +macro_rules! 
search_asserts { + ($haystack:expr, $needle:expr, $testname:expr, [$($func:ident),*], $result:expr) => { + let mut searcher = $needle.into_searcher($haystack); + let arr = [$( Step::from(searcher.$func()) ),+]; + assert_eq!(&arr[..], &$result, $testname); + } +} + +/// Combined enum for the results of next() and next_match()/next_reject() +#[derive(Debug, PartialEq, Eq)] +enum Step { + // variant names purposely chosen to + // be the same length for easy alignment + Matches(usize, usize), + Rejects(usize, usize), + InRange(usize, usize), + Done +} + +use Step::*; + +impl From for Step { + fn from(x: SearchStep) -> Self { + match x { + SearchStep::Match(a, b) => Matches(a, b), + SearchStep::Reject(a, b) => Rejects(a, b), + SearchStep::Done => Done + } + } +} + +impl From> for Step { + fn from(x: Option<(usize, usize)>) -> Self { + match x { + Some((a, b)) => InRange(a, b), + None => Done + } + } +} + +#[test] +fn test_simple_iteration() { + search_asserts! ("abcdeabcd", 'a', "forward iteration for ASCII string", + // a b c d e a b c d EOF + [next, next, next, next, next, next, next, next, next, next], + [Matches(0, 1), Rejects(1, 2), Rejects(2, 3), Rejects(3, 4), Rejects(4, 5), Matches(5, 6), Rejects(6, 7), Rejects(7, 8), Rejects(8, 9), Done] + ); + + search_asserts! ("abcdeabcd", 'a', "reverse iteration for ASCII string", + // d c b a e d c b a EOF + [next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back], + [Rejects(8, 9), Rejects(7, 8), Rejects(6, 7), Matches(5, 6), Rejects(4, 5), Rejects(3, 4), Rejects(2, 3), Rejects(1, 2), Matches(0, 1), Done] + ); + + search_asserts! ("我爱我的猫", '我', "forward iteration for Chinese string", + // 我 愛 我 的 貓 EOF + [next, next, next, next, next, next], + [Matches(0, 3), Rejects(3, 6), Matches(6, 9), Rejects(9, 12), Rejects(12, 15), Done] + ); + + search_asserts! 
("我的猫说meow", 'm', "forward iteration for mixed string", + // 我 的 猫 说 m e o w EOF + [next, next, next, next, next, next, next, next, next], + [Rejects(0, 3), Rejects(3, 6), Rejects(6, 9), Rejects(9, 12), Matches(12, 13), Rejects(13, 14), Rejects(14, 15), Rejects(15, 16), Done] + ); + + search_asserts! ("我的猫说meow", '猫', "reverse iteration for mixed string", + // w o e m 说 猫 的 我 EOF + [next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back, next_back], + [Rejects(15, 16), Rejects(14, 15), Rejects(13, 14), Rejects(12, 13), Rejects(9, 12), Matches(6, 9), Rejects(3, 6), Rejects(0, 3), Done] + ); +} From bc5535557662fb7851d80ff1538b5518af921571 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Sun, 17 Dec 2017 15:05:29 -0800 Subject: [PATCH 11/14] Add simple search test for pattern API --- src/libcore/tests/pattern.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/libcore/tests/pattern.rs b/src/libcore/tests/pattern.rs index e12f0bc9e5f6d..bb0a618f6b870 100644 --- a/src/libcore/tests/pattern.rs +++ b/src/libcore/tests/pattern.rs @@ -74,3 +74,27 @@ fn test_simple_iteration() { [Rejects(15, 16), Rejects(14, 15), Rejects(13, 14), Rejects(12, 13), Rejects(9, 12), Matches(6, 9), Rejects(3, 6), Rejects(0, 3), Done] ); } + +#[test] +fn test_simple_search() { + search_asserts!("abcdeabcdeabcde", 'a', "next_match for ASCII string", + [next_match, next_match, next_match, next_match], + [InRange(0, 1), InRange(5, 6), InRange(10, 11), Done] + ); + + search_asserts!("abcdeabcdeabcde", 'a', "next_match_back for ASCII string", + [next_match_back, next_match_back, next_match_back, next_match_back], + [InRange(10, 11), InRange(5, 6), InRange(0, 1), Done] + ); + + search_asserts!("abcdeab", 'a', "next_reject for ASCII string", + [next_reject, next_reject, next_match, next_reject, next_reject], + [InRange(1, 2), InRange(2, 3), InRange(5, 6), InRange(6, 7), Done] + ); + + search_asserts!("abcdeabcdeabcde", 'a', 
"next_reject_back for ASCII string", + [next_reject_back, next_reject_back, next_match_back, next_reject_back, next_reject_back, next_reject_back], + [InRange(14, 15), InRange(13, 14), InRange(10, 11), InRange(9, 10), InRange(8, 9), InRange(7, 8)] + ); +} + From 9b92a4419d6a76a9de6d56adb3084d97e3e31d20 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Mon, 18 Dec 2017 03:48:07 -0800 Subject: [PATCH 12/14] Add stresstests for shared bytes for pattern API --- src/libcore/tests/pattern.rs | 154 ++++++++++++++++++++++++++++++++++- 1 file changed, 153 insertions(+), 1 deletion(-) diff --git a/src/libcore/tests/pattern.rs b/src/libcore/tests/pattern.rs index bb0a618f6b870..7fe274a79ed08 100644 --- a/src/libcore/tests/pattern.rs +++ b/src/libcore/tests/pattern.rs @@ -21,7 +21,7 @@ enum Step { Done } -use Step::*; +use self::Step::*; impl From for Step { fn from(x: SearchStep) -> Self { @@ -42,6 +42,12 @@ impl From> for Step { } } +// XXXManishearth these tests focus on single-character searching (CharSearcher) +// and on next()/next_match(), not next_reject(). This is because +// the memchr changes make next_match() for single chars complex, but next_reject() +// continues to use next() under the hood. We should add more test cases for all +// of these, as well as tests for StrSearcher and higher level tests for str::find() (etc) + #[test] fn test_simple_iteration() { search_asserts! ("abcdeabcd", 'a', "forward iteration for ASCII string", @@ -98,3 +104,149 @@ fn test_simple_search() { ); } +// Á, 각, ก, 😁 all end in 0x81 +// 🁀, ᘀ do not end in 0x81 but contain the byte +// ꁁ has 0x81 as its second and third bytes. +// +// The memchr-using implementation of next_match +// and next_match_back temporarily violate +// the property that the search is always on a unicode boundary, +// which is fine as long as this never reaches next() or next_back(). +// So we test if next() is correct after each next_match() as well.
+const STRESS: &str = "Áa🁀bÁꁁfg😁각กᘀ각aÁ각ꁁก😁a"; + +#[test] +fn test_stress_indices() { + // this isn't really a test, more of documentation on the indices of each character in the stresstest string + + search_asserts!(STRESS, 'x', "Indices of characters in stress test", + [next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next, next], + [Rejects(0, 2), // Á + Rejects(2, 3), // a + Rejects(3, 7), // 🁀 + Rejects(7, 8), // b + Rejects(8, 10), // Á + Rejects(10, 13), // ꁁ + Rejects(13, 14), // f + Rejects(14, 15), // g + Rejects(15, 19), // 😁 + Rejects(19, 22), // 각 + Rejects(22, 25), // ก + Rejects(25, 28), // ᘀ + Rejects(28, 31), // 각 + Rejects(31, 32), // a + Rejects(32, 34), // Á + Rejects(34, 37), // 각 + Rejects(37, 40), // ꁁ + Rejects(40, 43), // ก + Rejects(43, 47), // 😁 + Rejects(47, 48), // a + Done] + ); +} + +#[test] +fn test_forward_search_shared_bytes() { + search_asserts!(STRESS, 'Á', "Forward search for two-byte Latin character", + [next_match, next_match, next_match, next_match], + [InRange(0, 2), InRange(8, 10), InRange(32, 34), Done] + ); + + search_asserts!(STRESS, 'Á', "Forward search for two-byte Latin character; check if next() still works", + [next_match, next, next_match, next, next_match, next, next_match], + [InRange(0, 2), Rejects(2, 3), InRange(8, 10), Rejects(10, 13), InRange(32, 34), Rejects(34, 37), Done] + ); + + search_asserts!(STRESS, '각', "Forward search for three-byte Hangul character", + [next_match, next, next_match, next_match, next_match], + [InRange(19, 22), Rejects(22, 25), InRange(28, 31), InRange(34, 37), Done] + ); + + search_asserts!(STRESS, '각', "Forward search for three-byte Hangul character; check if next() still works", + [next_match, next, next_match, next, next_match, next, next_match], + [InRange(19, 22), Rejects(22, 25), InRange(28, 31), Rejects(31, 32), InRange(34, 37), Rejects(37, 40), Done] + ); + + search_asserts!(STRESS, 'ก', "Forward search for
three-byte Thai character", + [next_match, next, next_match, next, next_match], + [InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done] + ); + + search_asserts!(STRESS, 'ก', "Forward search for three-byte Thai character; check if next() still works", + [next_match, next, next_match, next, next_match], + [InRange(22, 25), Rejects(25, 28), InRange(40, 43), Rejects(43, 47), Done] + ); + + search_asserts!(STRESS, '😁', "Forward search for four-byte emoji", + [next_match, next, next_match, next, next_match], + [InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done] + ); + + search_asserts!(STRESS, '😁', "Forward search for four-byte emoji; check if next() still works", + [next_match, next, next_match, next, next_match], + [InRange(15, 19), Rejects(19, 22), InRange(43, 47), Rejects(47, 48), Done] + ); + + search_asserts!(STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes", + [next_match, next, next_match, next, next_match], + [InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done] + ); + + search_asserts!(STRESS, 'ꁁ', "Forward search for three-byte Yi character with repeated bytes; check if next() still works", + [next_match, next, next_match, next, next_match], + [InRange(10, 13), Rejects(13, 14), InRange(37, 40), Rejects(40, 43), Done] + ); +} + +#[test] +fn test_reverse_search_shared_bytes() { + search_asserts!(STRESS, 'Á', "Reverse search for two-byte Latin character", + [next_match_back, next_match_back, next_match_back, next_match_back], + [InRange(32, 34), InRange(8, 10), InRange(0, 2), Done] + ); + + search_asserts!(STRESS, 'Á', "Reverse search for two-byte Latin character; check if next_back() still works", + [next_match_back, next_back, next_match_back, next_back, next_match_back, next_back], + [InRange(32, 34), Rejects(31, 32), InRange(8, 10), Rejects(7, 8), InRange(0, 2), Done] + ); + + search_asserts!(STRESS, '각', "Reverse search for three-byte Hangul character", + 
[next_match_back, next_back, next_match_back, next_match_back, next_match_back], + [InRange(34, 37), Rejects(32, 34), InRange(28, 31), InRange(19, 22), Done] + ); + + search_asserts!(STRESS, '각', "Reverse search for three-byte Hangul character; check if next_back() still works", + [next_match_back, next_back, next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(34, 37), Rejects(32, 34), InRange(28, 31), Rejects(25, 28), InRange(19, 22), Rejects(15, 19), Done] + ); + + search_asserts!(STRESS, 'ก', "Reverse search for three-byte Thai character", + [next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done] + ); + + search_asserts!(STRESS, 'ก', "Reverse search for three-byte Thai character; check if next_back() still works", + [next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(40, 43), Rejects(37, 40), InRange(22, 25), Rejects(19, 22), Done] + ); + + search_asserts!(STRESS, '😁', "Reverse search for four-byte emoji", + [next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done] + ); + + search_asserts!(STRESS, '😁', "Reverse search for four-byte emoji; check if next_back() still works", + [next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(43, 47), Rejects(40, 43), InRange(15, 19), Rejects(14, 15), Done] + ); + + search_asserts!(STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes", + [next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(37, 40), Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done] + ); + + search_asserts!(STRESS, 'ꁁ', "Reverse search for three-byte Yi character with repeated bytes; check if next_back() still works", + [next_match_back, next_back, next_match_back, next_back, next_match_back], + [InRange(37, 40), 
Rejects(34, 37), InRange(10, 13), Rejects(8, 10), Done] + ); +} From 85919a0b5f474783cb56cd433292865a40539665 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Fri, 22 Dec 2017 11:19:50 +0530 Subject: [PATCH 13/14] Pass tidy for tests --- src/libcore/tests/pattern.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/libcore/tests/pattern.rs b/src/libcore/tests/pattern.rs index 7fe274a79ed08..d0fd15263b219 100644 --- a/src/libcore/tests/pattern.rs +++ b/src/libcore/tests/pattern.rs @@ -1,3 +1,13 @@ +// Copyright 2017 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + use std::str::pattern::*; // This macro makes it easier to write @@ -42,7 +52,9 @@ impl From> for Step { } } -// XXXManishearth these tests focus on single-character searching (CharSearcher) +// ignore-tidy-linelength + +// FIXME(Manishearth) these tests focus on single-character searching (CharSearcher) // and on next()/next_match(), not next_reject(). This is because // the memchr changes make next_match() for single chars complex, but next_reject() // continues to use next() under the hood. We should add more test cases for all @@ -51,7 +63,7 @@ impl From> for Step { #[test] fn test_simple_iteration() { search_asserts! 
("abcdeabcd", 'a', "forward iteration for ASCII string", - // a b c d e a b c d EOF + // a b c d e a b c d EOF [next, next, next, next, next, next, next, next, next, next], [Matches(0, 1), Rejects(1, 2), Rejects(2, 3), Rejects(3, 4), Rejects(4, 5), Matches(5, 6), Rejects(6, 7), Rejects(7, 8), Rejects(8, 9), Done] ); From 5cf55165fae5c8538db5c00e252ad9ba42aaf246 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Mon, 1 Jan 2018 19:55:21 +0530 Subject: [PATCH 14/14] handle overflow/underflow in index offsets --- src/libcore/str/pattern.rs | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index e44799bb9c5ab..677c0ecc33d7f 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -330,10 +330,12 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> { // find something. When we find something the `finger` will be set // to a UTF8 boundary. self.finger += index + 1; - let found_char = self.finger - self.utf8_size; - if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) { - if slice == &self.utf8_encoded[0..self.utf8_size] { - return Some((found_char, self.finger)); + if self.finger >= self.utf8_size { + let found_char = self.finger - self.utf8_size; + if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) { + if slice == &self.utf8_encoded[0..self.utf8_size] { + return Some((found_char, self.finger)); + } } } } else { @@ -386,12 +388,15 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> { // char in the paradigm of reverse iteration). For // multibyte chars we need to skip down by the number of more // bytes they have than ASCII - let found_char = index - (self.utf8_size - 1); - if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) { - if slice == &self.utf8_encoded[0..self.utf8_size] { - // move finger to before the character found (i.e. 
at its start index) - self.finger_back = found_char; - return Some((self.finger_back, self.finger_back + self.utf8_size)); + let shift = self.utf8_size - 1; + if index >= shift { + let found_char = index - shift; + if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) { + if slice == &self.utf8_encoded[0..self.utf8_size] { + // move finger to before the character found (i.e. at its start index) + self.finger_back = found_char; + return Some((self.finger_back, self.finger_back + self.utf8_size)); + } } } // We can't use finger_back = index - size + 1 here. If we found the last char