Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make the reflect path parser utf-8-unaware #9371

Merged
merged 5 commits into from
Aug 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 21 additions & 21 deletions crates/bevy_reflect/src/path/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -400,12 +400,12 @@ mod tests {
#[derive(Reflect)]
struct B {
foo: usize,
bar: C,
łørđ: C,
}

#[derive(Reflect)]
struct C {
baz: f32,
mосква: f32,
}

#[derive(Reflect)]
Expand All @@ -418,21 +418,21 @@ mod tests {
enum F {
Unit,
Tuple(u32, u32),
Struct { value: char },
Şķràźÿ { 東京: char },
}

fn a_sample() -> A {
A {
w: 1,
x: B {
foo: 10,
bar: C { baz: 3.14 },
łørđ: C { mосква: 3.14 },
},
y: vec![C { baz: 1.0 }, C { baz: 2.0 }],
y: vec![C { mосква: 1.0 }, C { mосква: 2.0 }],
z: D(E(10.0, 42)),
unit_variant: F::Unit,
tuple_variant: F::Tuple(123, 321),
struct_variant: F::Struct { value: 'm' },
struct_variant: F::Şķràźÿ { 東京: 'm' },
array: [86, 75, 309],
tuple: (true, 1.23),
}
Expand Down Expand Up @@ -460,19 +460,19 @@ mod tests {
&[(access_field("x"), 1), (access_field("foo"), 2)]
);
assert_eq!(
&*ParsedPath::parse("x.bar.baz").unwrap().0,
&*ParsedPath::parse("x.łørđ.mосква").unwrap().0,
&[
(access_field("x"), 1),
(access_field("bar"), 2),
(access_field("baz"), 6)
(access_field("łørđ"), 2),
(access_field("mосква"), 10)
]
);
assert_eq!(
&*ParsedPath::parse("y[1].baz").unwrap().0,
&*ParsedPath::parse("y[1].mосква").unwrap().0,
&[
(access_field("y"), 1),
(Access::ListIndex(1), 2),
(access_field("baz"), 5)
(access_field("mосква"), 5)
]
);
assert_eq!(
Expand Down Expand Up @@ -503,14 +503,14 @@ mod tests {

let b = ParsedPath::parse("w").unwrap();
let c = ParsedPath::parse("x.foo").unwrap();
let d = ParsedPath::parse("x.bar.baz").unwrap();
let e = ParsedPath::parse("y[1].baz").unwrap();
let d = ParsedPath::parse("x.łørđ.mосква").unwrap();
let e = ParsedPath::parse("y[1].mосква").unwrap();
let f = ParsedPath::parse("z.0.1").unwrap();
let g = ParsedPath::parse("x#0").unwrap();
let h = ParsedPath::parse("x#1#0").unwrap();
let i = ParsedPath::parse("unit_variant").unwrap();
let j = ParsedPath::parse("tuple_variant.1").unwrap();
let k = ParsedPath::parse("struct_variant.value").unwrap();
let k = ParsedPath::parse("struct_variant.東京").unwrap();
let l = ParsedPath::parse("struct_variant#0").unwrap();
let m = ParsedPath::parse("array[2]").unwrap();
let n = ParsedPath::parse("tuple.1").unwrap();
Expand Down Expand Up @@ -580,15 +580,15 @@ mod tests {

assert_eq!(*a.path::<usize>("w").unwrap(), 1);
assert_eq!(*a.path::<usize>("x.foo").unwrap(), 10);
assert_eq!(*a.path::<f32>("x.bar.baz").unwrap(), 3.14);
assert_eq!(*a.path::<f32>("y[1].baz").unwrap(), 2.0);
assert_eq!(*a.path::<f32>("x.łørđ.mосква").unwrap(), 3.14);
assert_eq!(*a.path::<f32>("y[1].mосква").unwrap(), 2.0);
assert_eq!(*a.path::<usize>("z.0.1").unwrap(), 42);
assert_eq!(*a.path::<usize>("x#0").unwrap(), 10);
assert_eq!(*a.path::<f32>("x#1#0").unwrap(), 3.14);

assert_eq!(*a.path::<F>("unit_variant").unwrap(), F::Unit);
assert_eq!(*a.path::<u32>("tuple_variant.1").unwrap(), 321);
assert_eq!(*a.path::<char>("struct_variant.value").unwrap(), 'm');
assert_eq!(*a.path::<char>("struct_variant.東京").unwrap(), 'm');
assert_eq!(*a.path::<char>("struct_variant#0").unwrap(), 'm');

assert_eq!(*a.path::<i32>("array[2]").unwrap(), 309);
Expand All @@ -597,8 +597,8 @@ mod tests {
*a.path_mut::<f32>("tuple.1").unwrap() = 3.21;
assert_eq!(*a.path::<f32>("tuple.1").unwrap(), 3.21);

*a.path_mut::<f32>("y[1].baz").unwrap() = 3.0;
assert_eq!(a.y[1].baz, 3.0);
*a.path_mut::<f32>("y[1].mосква").unwrap() = 3.0;
assert_eq!(a.y[1].mосква, 3.0);

*a.path_mut::<u32>("tuple_variant.0").unwrap() = 1337;
assert_eq!(a.tuple_variant, F::Tuple(1337, 321));
Expand Down Expand Up @@ -649,8 +649,8 @@ mod tests {
&[(Access::TupleIndex(5), 1)]
);
assert_eq!(
&*ParsedPath::parse("[0].bar").unwrap().0,
&[(Access::ListIndex(0), 1), (access_field("bar"), 4)]
&*ParsedPath::parse("[0].łørđ").unwrap().0,
&[(Access::ListIndex(0), 1), (access_field("łørđ"), 4)]
);
}
}
76 changes: 47 additions & 29 deletions crates/bevy_reflect/src/path/parse.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::{fmt, num::ParseIntError};
use std::{fmt, num::ParseIntError, str::from_utf8_unchecked};

use thiserror::Error;

Expand Down Expand Up @@ -33,28 +33,37 @@ enum Error<'a> {

pub(super) struct PathParser<'a> {
path: &'a str,
offset: usize,
remaining: &'a [u8],
}
impl<'a> PathParser<'a> {
pub(super) fn new(path: &'a str) -> Self {
PathParser { path, offset: 0 }
let remaining = path.as_bytes();
PathParser { path, remaining }
}

fn next_token(&mut self) -> Option<Token<'a>> {
let input = &self.path[self.offset..];
let to_parse = self.remaining;

// Return with `None` if empty.
let first_char = input.chars().next()?;
let (first_byte, remaining) = to_parse.split_first()?;

if let Some(token) = Token::symbol_from_char(first_char) {
self.offset += 1; // NOTE: we assume all symbols are ASCII
if let Some(token) = Token::symbol_from_byte(*first_byte) {
self.remaining = remaining; // NOTE: all symbols are ASCII
return Some(token);
}
// We are parsing either `0123` or `field`.
// If we do not find a subsequent token, we are at the end of the parse string.
let ident = input.split_once(Token::SYMBOLS).map_or(input, |t| t.0);

self.offset += ident.len();
let ident_len = to_parse.iter().position(|t| Token::SYMBOLS.contains(t));
let (ident, remaining) = to_parse.split_at(ident_len.unwrap_or(to_parse.len()));
// SAFETY: This relies on `self.remaining` always remaining valid UTF8:
// - self.remaining is a slice derived from self.path (valid &str)
// - The slice's end is either the same as the valid &str or
// the last byte before an ASCII utf-8 character (ie: it is a char
// boundary).
// - The slice always starts after a symbol ie: an ASCII character's boundary.
let ident = unsafe { from_utf8_unchecked(ident) };

self.remaining = remaining;
Some(Token::Ident(Ident(ident)))
}

Expand Down Expand Up @@ -82,13 +91,17 @@ impl<'a> PathParser<'a> {
}
}
}

fn offset(&self) -> usize {
self.path.len() - self.remaining.len()
}
}
impl<'a> Iterator for PathParser<'a> {
type Item = (Result<Access<'a>, ReflectPathError<'a>>, usize);

fn next(&mut self) -> Option<Self::Item> {
let token = self.next_token()?;
let offset = self.offset;
let offset = self.offset();
let err = |error| ReflectPathError::ParseError {
offset,
path: self.path,
Expand All @@ -114,33 +127,38 @@ impl<'a> Ident<'a> {
}
}

// NOTE: We use repr(u8) so that the `match byte` in `Token::symbol_from_byte`
// becomes a "check `byte` is one of SYMBOLS and forward its value" this makes
// the optimizer happy, and shaves off a few cycles.
#[derive(Debug, PartialEq, Eq)]
#[repr(u8)]
enum Token<'a> {
Dot,
Pound,
OpenBracket,
CloseBracket,
Dot = b'.',
Pound = b'#',
OpenBracket = b'[',
CloseBracket = b']',
Ident(Ident<'a>),
}
impl fmt::Display for Token<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Token::Dot => f.write_str("."),
Token::Pound => f.write_str("#"),
Token::OpenBracket => f.write_str("["),
Token::CloseBracket => f.write_str("]"),
Token::Ident(ident) => f.write_str(ident.0),
}
let text = match self {
Token::Dot => ".",
Token::Pound => "#",
Token::OpenBracket => "[",
Token::CloseBracket => "]",
Token::Ident(ident) => ident.0,
};
f.write_str(text)
}
}
impl<'a> Token<'a> {
const SYMBOLS: &[char] = &['.', '#', '[', ']'];
fn symbol_from_char(char: char) -> Option<Self> {
match char {
'.' => Some(Self::Dot),
'#' => Some(Self::Pound),
'[' => Some(Self::OpenBracket),
']' => Some(Self::CloseBracket),
const SYMBOLS: &[u8] = b".#[]";
fn symbol_from_byte(byte: u8) -> Option<Self> {
match byte {
b'.' => Some(Self::Dot),
b'#' => Some(Self::Pound),
b'[' => Some(Self::OpenBracket),
b']' => Some(Self::CloseBracket),
_ => None,
}
}
Expand Down