Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Store core::str::CharSearcher::utf8_size as u8 #119808

Merged
merged 1 commit into from
Feb 19, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 23 additions & 11 deletions library/core/src/str/pattern.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@

use crate::cmp;
use crate::cmp::Ordering;
use crate::convert::TryInto as _;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm surprised this is necessary; `TryInto` should be in the prelude.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like `core` isn't using edition 2021? That's quite weird, though it might just have been rust-analyzer weirdness in the Rust codebase.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use crate::fmt;
use crate::slice::memchr;

Expand Down Expand Up @@ -370,11 +371,17 @@ pub struct CharSearcher<'a> {

// safety invariant: `utf8_size` must be less than 5
/// The number of bytes `needle` takes up when encoded in utf8.
utf8_size: usize,
utf8_size: u8,
/// A utf8 encoded copy of the `needle`
utf8_encoded: [u8; 4],
}

impl CharSearcher<'_> {
    /// Returns the value of the `utf8_size` field, widened to `usize`.
    ///
    /// The field is stored as a `u8`; this accessor hands it back as a
    /// `usize` so it can be used directly in index and offset arithmetic.
    fn utf8_size(&self) -> usize {
        // `u8` -> `usize` is a lossless widening conversion.
        usize::from(self.utf8_size)
    }
}

unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
#[inline]
fn haystack(&self) -> &'a str {
Expand Down Expand Up @@ -414,7 +421,7 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
let bytes = self.haystack.as_bytes().get(self.finger..self.finger_back)?;
// the last byte of the utf8 encoded needle
// SAFETY: we have an invariant that `utf8_size < 5`
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size() - 1) };
if let Some(index) = memchr::memchr(last_byte, bytes) {
// The new finger is the index of the byte we found,
// plus one, since we memchr'd for the last byte of the character.
Expand All @@ -434,10 +441,10 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
// find something. When we find something the `finger` will be set
// to a UTF8 boundary.
self.finger += index + 1;
if self.finger >= self.utf8_size {
let found_char = self.finger - self.utf8_size;
if self.finger >= self.utf8_size() {
let found_char = self.finger - self.utf8_size();
if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) {
if slice == &self.utf8_encoded[0..self.utf8_size] {
if slice == &self.utf8_encoded[0..self.utf8_size()] {
return Some((found_char, self.finger));
}
}
Expand Down Expand Up @@ -482,7 +489,7 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
let bytes = haystack.get(self.finger..self.finger_back)?;
// the last byte of the utf8 encoded needle
// SAFETY: we have an invariant that `utf8_size < 5`
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size() - 1) };
if let Some(index) = memchr::memrchr(last_byte, bytes) {
// we searched a slice that was offset by self.finger,
// add self.finger to recoup the original index
Expand All @@ -493,14 +500,14 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
// char in the paradigm of reverse iteration). For
// multibyte chars we need to skip down by the number of more
// bytes they have than ASCII
let shift = self.utf8_size - 1;
let shift = self.utf8_size() - 1;
if index >= shift {
let found_char = index - shift;
if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) {
if slice == &self.utf8_encoded[0..self.utf8_size] {
if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size())) {
if slice == &self.utf8_encoded[0..self.utf8_size()] {
// move finger to before the character found (i.e., at its start index)
self.finger_back = found_char;
return Some((self.finger_back, self.finger_back + self.utf8_size));
return Some((self.finger_back, self.finger_back + self.utf8_size()));
}
}
}
Expand Down Expand Up @@ -542,7 +549,12 @@ impl<'a> Pattern<'a> for char {
#[inline]
fn into_searcher(self, haystack: &'a str) -> Self::Searcher {
let mut utf8_encoded = [0; 4];
let utf8_size = self.encode_utf8(&mut utf8_encoded).len();
let utf8_size = self
.encode_utf8(&mut utf8_encoded)
.len()
.try_into()
.expect("char len should be less than 255");

CharSearcher {
haystack,
finger: 0,
Expand Down
Loading