From a849cf32d4b870a5dbed222d0b970ca174295e3a Mon Sep 17 00:00:00 2001 From: Nicola Papale Date: Mon, 31 Jul 2023 12:07:57 +0200 Subject: [PATCH 1/5] Make the reflect path parser utf-8-unaware This works because the delimiter symbols '.[]#' are all ASCII, and the parser really only needs to care about delimiters, so we can avoid the overhead of handling properly at all steps utf-8 strings. This is a major improvement when comparing to the previous parser. 1. `access_following` and `next_token` now inline in PathParser::next 2. Benchmarking show a 20% performance increase --- crates/bevy_reflect/src/path/parse.rs | 61 +++++++++++++++++---------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/crates/bevy_reflect/src/path/parse.rs b/crates/bevy_reflect/src/path/parse.rs index fc990deb00a45..1134a88b9e485 100644 --- a/crates/bevy_reflect/src/path/parse.rs +++ b/crates/bevy_reflect/src/path/parse.rs @@ -1,4 +1,4 @@ -use std::{fmt, num::ParseIntError}; +use std::{fmt, num::ParseIntError, str::from_utf8_unchecked}; use thiserror::Error; @@ -33,28 +33,37 @@ enum Error<'a> { pub(super) struct PathParser<'a> { path: &'a str, - offset: usize, + remaining: &'a [u8], } impl<'a> PathParser<'a> { pub(super) fn new(path: &'a str) -> Self { - PathParser { path, offset: 0 } + let remaining = path.as_bytes(); + PathParser { path, remaining } } fn next_token(&mut self) -> Option> { - let input = &self.path[self.offset..]; + let to_parse = self.remaining; // Return with `None` if empty. - let first_char = input.chars().next()?; + let (first_byte, remaining) = to_parse.split_first()?; - if let Some(token) = Token::symbol_from_char(first_char) { - self.offset += 1; // NOTE: we assume all symbols are ASCII + if let Some(token) = Token::symbol_from_byte(*first_byte) { + self.remaining = remaining; // NOTE: all symbols are ASCII return Some(token); } // We are parsing either `0123` or `field`. // If we do not find a subsequent token, we are at the end of the parse string. - let ident = input.split_once(Token::SYMBOLS).map_or(input, |t| t.0); - - self.offset += ident.len(); + let ident_len = to_parse.iter().position(|t| Token::SYMBOLS.contains(t)); + let (ident, remaining) = to_parse.split_at(ident_len.unwrap_or(to_parse.len())); + // SAFETY: This relies on `self.remaining` always remaining valid UTF8: + // - self.remaining is a slice derived from self.path (valid &str) + // - The slice's end is either the same as the valid &str or + // the last byte before an ASCII utf-8 character (ie: it is a char + // boundary). + // - The slice always starts after a symbol ie: an ASCII character's boundary. + let ident = unsafe { from_utf8_unchecked(ident) }; + + self.remaining = remaining; Some(Token::Ident(Ident(ident))) } @@ -82,13 +91,17 @@ impl<'a> PathParser<'a> { } } } + + fn offset(&self) -> usize { + self.path.len() - self.remaining.len() + } } impl<'a> Iterator for PathParser<'a> { type Item = (Result, ReflectPathError<'a>>, usize); fn next(&mut self) -> Option { let token = self.next_token()?; - let offset = self.offset; + let offset = self.offset(); let err = |error| ReflectPathError::ParseError { offset, path: self.path, @@ -114,12 +127,16 @@ impl<'a> Ident<'a> { } } +// NOTE: We use repr(u8) so that the `match byte` in `Token::symbol_from_byte` +// becomes a "check `byte` is one of SYMBOLS and forward its value" this makes +// the optimizer happy, and shaves off a few cycles. #[derive(Debug, PartialEq, Eq)] +#[repr(u8)] enum Token<'a> { - Dot, - Pound, - OpenBracket, - CloseBracket, + Dot = b'.', + Pound = b'#', + OpenBracket = b'[', + CloseBracket = b']', Ident(Ident<'a>), } impl fmt::Display for Token<'_> { @@ -134,13 +151,13 @@ impl fmt::Display for Token<'_> { } } impl<'a> Token<'a> { - const SYMBOLS: &[char] = &['.', '#', '[', ']']; - fn symbol_from_char(char: char) -> Option { - match char { - '.' => Some(Self::Dot), - '#' => Some(Self::Pound), - '[' => Some(Self::OpenBracket), - ']' => Some(Self::CloseBracket), + const SYMBOLS: &[u8] = &[b'.', b'#', b'[', b']']; + fn symbol_from_byte(byte: u8) -> Option { + match byte { + b'.' => Some(Self::Dot), + b'#' => Some(Self::Pound), + b'[' => Some(Self::OpenBracket), + b']' => Some(Self::CloseBracket), _ => None, } } From eef777d931abedb391cb5dac2e4592270b950a24 Mon Sep 17 00:00:00 2001 From: Nicola Papale Date: Tue, 15 Aug 2023 16:48:48 +0200 Subject: [PATCH 2/5] Optimize Display::fmt for Token --- crates/bevy_reflect/src/path/parse.rs | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/crates/bevy_reflect/src/path/parse.rs b/crates/bevy_reflect/src/path/parse.rs index 1134a88b9e485..c8054101d315e 100644 --- a/crates/bevy_reflect/src/path/parse.rs +++ b/crates/bevy_reflect/src/path/parse.rs @@ -141,12 +141,21 @@ enum Token<'a> { } impl fmt::Display for Token<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Token::Dot => f.write_str("."), - Token::Pound => f.write_str("#"), - Token::OpenBracket => f.write_str("["), - Token::CloseBracket => f.write_str("]"), - Token::Ident(ident) => f.write_str(ident.0), + if let Token::Ident(ident) = self { + f.write_str(ident.0) + } else { + let byte = match self { + Token::Dot => b'.', + Token::Pound => b'#', + Token::OpenBracket => b'[', + Token::CloseBracket => b']', + Token::Ident(_) => unreachable!(), + }; + let byte = &[byte]; + // SAFETY: we just defined `byte`, it is indeed a single ASCII character, + // therefore valid utf8. + let str = unsafe { from_utf8_unchecked(byte) }; + f.write_str(str) } } } From 4dc7516016f171027953a59ddce9b2e2d94eb726 Mon Sep 17 00:00:00 2001 From: Nicola Papale Date: Sun, 20 Aug 2023 19:24:32 +0200 Subject: [PATCH 3/5] Use UTF8 in reflect path tests --- crates/bevy_reflect/src/path/mod.rs | 42 ++++++++++++++--------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/crates/bevy_reflect/src/path/mod.rs b/crates/bevy_reflect/src/path/mod.rs index 2e668b676664e..2e4d744cc701f 100644 --- a/crates/bevy_reflect/src/path/mod.rs +++ b/crates/bevy_reflect/src/path/mod.rs @@ -400,12 +400,12 @@ mod tests { #[derive(Reflect)] struct B { foo: usize, - bar: C, + łørđ: C, } #[derive(Reflect)] struct C { - baz: f32, + mосква: f32, } #[derive(Reflect)] @@ -418,7 +418,7 @@ mod tests { enum F { Unit, Tuple(u32, u32), - Struct { value: char }, + Şķràźÿ { 東京: char }, } fn a_sample() -> A { @@ -426,13 +426,13 @@ mod tests { w: 1, x: B { foo: 10, - bar: C { baz: 3.14 }, + łørđ: C { mосква: 3.14 }, }, - y: vec![C { baz: 1.0 }, C { baz: 2.0 }], + y: vec![C { mосква: 1.0 }, C { mосква: 2.0 }], z: D(E(10.0, 42)), unit_variant: F::Unit, tuple_variant: F::Tuple(123, 321), - struct_variant: F::Struct { value: 'm' }, + struct_variant: F::Şķràźÿ { 東京: 'm' }, array: [86, 75, 309], tuple: (true, 1.23), } @@ -460,19 +460,19 @@ mod tests { &[(access_field("x"), 1), (access_field("foo"), 2)] ); assert_eq!( - &*ParsedPath::parse("x.bar.baz").unwrap().0, + &*ParsedPath::parse("x.łørđ.mосква").unwrap().0, &[ (access_field("x"), 1), - (access_field("bar"), 2), - (access_field("baz"), 6) + (access_field("łørđ"), 2), + (access_field("mосква"), 10) ] ); assert_eq!( - &*ParsedPath::parse("y[1].baz").unwrap().0, + &*ParsedPath::parse("y[1].mосква").unwrap().0, &[ (access_field("y"), 1), (Access::ListIndex(1), 2), - (access_field("baz"), 5) + (access_field("mосква"), 5) ] ); assert_eq!( @@ -503,14 +503,14 @@ mod tests { let b = ParsedPath::parse("w").unwrap(); let c = ParsedPath::parse("x.foo").unwrap(); - let d = ParsedPath::parse("x.bar.baz").unwrap(); - let e = ParsedPath::parse("y[1].baz").unwrap(); + let d = ParsedPath::parse("x.łørđ.mосква").unwrap(); + let e = ParsedPath::parse("y[1].mосква").unwrap(); let f = ParsedPath::parse("z.0.1").unwrap(); let g = ParsedPath::parse("x#0").unwrap(); let h = ParsedPath::parse("x#1#0").unwrap(); let i = ParsedPath::parse("unit_variant").unwrap(); let j = ParsedPath::parse("tuple_variant.1").unwrap(); - let k = ParsedPath::parse("struct_variant.value").unwrap(); + let k = ParsedPath::parse("struct_variant.東京").unwrap(); let l = ParsedPath::parse("struct_variant#0").unwrap(); let m = ParsedPath::parse("array[2]").unwrap(); let n = ParsedPath::parse("tuple.1").unwrap(); @@ -580,15 +580,15 @@ mod tests { assert_eq!(*a.path::("w").unwrap(), 1); assert_eq!(*a.path::("x.foo").unwrap(), 10); - assert_eq!(*a.path::("x.bar.baz").unwrap(), 3.14); - assert_eq!(*a.path::("y[1].baz").unwrap(), 2.0); + assert_eq!(*a.path::("x.łørđ.mосква").unwrap(), 3.14); + assert_eq!(*a.path::("y[1].mосква").unwrap(), 2.0); assert_eq!(*a.path::("z.0.1").unwrap(), 42); assert_eq!(*a.path::("x#0").unwrap(), 10); assert_eq!(*a.path::("x#1#0").unwrap(), 3.14); assert_eq!(*a.path::("unit_variant").unwrap(), F::Unit); assert_eq!(*a.path::("tuple_variant.1").unwrap(), 321); - assert_eq!(*a.path::("struct_variant.value").unwrap(), 'm'); + assert_eq!(*a.path::("struct_variant.東京").unwrap(), 'm'); assert_eq!(*a.path::("struct_variant#0").unwrap(), 'm'); assert_eq!(*a.path::("array[2]").unwrap(), 309); @@ -597,8 +597,8 @@ mod tests { *a.path_mut::("tuple.1").unwrap() = 3.21; assert_eq!(*a.path::("tuple.1").unwrap(), 3.21); - *a.path_mut::("y[1].baz").unwrap() = 3.0; - assert_eq!(a.y[1].baz, 3.0); + *a.path_mut::("y[1].mосква").unwrap() = 3.0; + assert_eq!(a.y[1].mосква, 3.0); *a.path_mut::("tuple_variant.0").unwrap() = 1337; assert_eq!(a.tuple_variant, F::Tuple(1337, 321)); @@ -649,8 +649,8 @@ mod tests { &[(Access::TupleIndex(5), 1)] ); assert_eq!( - &*ParsedPath::parse("[0].bar").unwrap().0, - &[(Access::ListIndex(0), 1), (access_field("bar"), 4)] + &*ParsedPath::parse("[0].łørđ").unwrap().0, + &[(Access::ListIndex(0), 1), (access_field("łørđ"), 4)] ); } } From 20b6c5f181876883d5a0bda3edc2c4427bd18212 Mon Sep 17 00:00:00 2001 From: Nicola Papale Date: Thu, 24 Aug 2023 09:57:54 +0200 Subject: [PATCH 4/5] Use shorter syntax for SYMBOLS declaration --- crates/bevy_reflect/src/path/parse.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/bevy_reflect/src/path/parse.rs b/crates/bevy_reflect/src/path/parse.rs index c8054101d315e..237504298e4ab 100644 --- a/crates/bevy_reflect/src/path/parse.rs +++ b/crates/bevy_reflect/src/path/parse.rs @@ -160,7 +160,7 @@ impl fmt::Display for Token<'_> { } } impl<'a> Token<'a> { - const SYMBOLS: &[u8] = &[b'.', b'#', b'[', b']']; + const SYMBOLS: &[u8] = b".#[]"; fn symbol_from_byte(byte: u8) -> Option { match byte { b'.' => Some(Self::Dot), From f512e792ec7bf0fde0f6d376cd9cc8b880c29f22 Mon Sep 17 00:00:00 2001 From: Nicola Papale Date: Fri, 25 Aug 2023 10:50:12 +0200 Subject: [PATCH 5/5] Do not use unsafe in fmt::Display Token impl --- crates/bevy_reflect/src/path/parse.rs | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/crates/bevy_reflect/src/path/parse.rs b/crates/bevy_reflect/src/path/parse.rs index 237504298e4ab..f6e730279fc41 100644 --- a/crates/bevy_reflect/src/path/parse.rs +++ b/crates/bevy_reflect/src/path/parse.rs @@ -141,22 +141,14 @@ enum Token<'a> { } impl fmt::Display for Token<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - if let Token::Ident(ident) = self { - f.write_str(ident.0) - } else { - let byte = match self { - Token::Dot => b'.', - Token::Pound => b'#', - Token::OpenBracket => b'[', - Token::CloseBracket => b']', - Token::Ident(_) => unreachable!(), - }; - let byte = &[byte]; - // SAFETY: we just defined `byte`, it is indeed a single ASCII character, - // therefore valid utf8. - let str = unsafe { from_utf8_unchecked(byte) }; - f.write_str(str) - } + let text = match self { + Token::Dot => ".", + Token::Pound => "#", + Token::OpenBracket => "[", + Token::CloseBracket => "]", + Token::Ident(ident) => ident.0, + }; + f.write_str(text) } } impl<'a> Token<'a> {