Skip to content

Commit d0a9746

Browse files
Improve boundary classification (#17005)
This PR cleans up the boundary character checking by using similar classification techniques as we used for other classification problems. For starters, this moves the boundary related items to its own file, next we setup the classification enum. Last but not least, we removed `}` as an _after_ boundary character, and instead handle that situation in the Ruby pre processor where we need it. This means the `%w{flex}` will still work in Ruby files. --- This PR is a followup for #17001, the main goal is to clean up some of the boundary character checking code. The other big improvement is performance. Changing the boundary character checking to use a classification instead results in: Took the best score of 10 runs each: ```diff - CandidateMachine: Throughput: 311.96 MB/s + CandidateMachine: Throughput: 333.52 MB/s ``` So a ~20MB/s improvement. # Test plan 1. Existing tests should pass. Due to the removal of `}` as an after boundary character, some tests are updated. 2. Added new tests to ensure the Ruby pre processor still works as expected. --------- Co-authored-by: Jordan Pittman <jordan@cryptica.me>
1 parent 57e91a6 commit d0a9746

File tree

7 files changed

+156
-61
lines changed

7 files changed

+156
-61
lines changed

crates/classification-macros/src/lib.rs

+6
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,12 @@ pub fn classify_bytes_derive(input: TokenStream) -> TokenStream {
107107
#enum_name::TABLE[byte as usize]
108108
}
109109
}
110+
111+
impl From<&u8> for #enum_name {
112+
fn from(byte: &u8) -> Self {
113+
#enum_name::TABLE[*byte as usize]
114+
}
115+
}
110116
};
111117

112118
TokenStream::from(expanded)
+104
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
use classification_macros::ClassifyBytes;
2+
3+
use crate::extractor::Span;
4+
5+
#[inline(always)]
6+
pub fn is_valid_before_boundary(c: &u8) -> bool {
7+
matches!(c.into(), Class::Common | Class::Before)
8+
}
9+
10+
#[inline(always)]
11+
pub fn is_valid_after_boundary(c: &u8) -> bool {
12+
matches!(c.into(), Class::Common | Class::After)
13+
}
14+
15+
#[inline(always)]
16+
pub fn has_valid_boundaries(span: &Span, input: &[u8]) -> bool {
17+
let before = {
18+
if span.start == 0 {
19+
b'\0'
20+
} else {
21+
input[span.start - 1]
22+
}
23+
};
24+
25+
let after = {
26+
if span.end >= input.len() - 1 {
27+
b'\0'
28+
} else {
29+
input[span.end + 1]
30+
}
31+
};
32+
33+
// Ensure the span has valid boundary characters before and after
34+
is_valid_before_boundary(&before) && is_valid_after_boundary(&after)
35+
}
36+
37+
#[derive(Debug, Clone, Copy, ClassifyBytes)]
38+
enum Class {
39+
// Whitespace, e.g.:
40+
//
41+
// ```
42+
// <div class="flex flex-col items-center"></div>
43+
// ^ ^
44+
// ```
45+
#[bytes(b'\t', b'\n', b'\x0C', b'\r', b' ')]
46+
// Quotes, e.g.:
47+
//
48+
// ```
49+
// <div class="flex">
50+
// ^ ^
51+
// ```
52+
#[bytes(b'"', b'\'', b'`')]
53+
// End of the input, e.g.:
54+
//
55+
// ```
56+
// flex
57+
// ^
58+
// ```
59+
#[bytes(b'\0')]
60+
Common,
61+
62+
// Angular like attributes, e.g.:
63+
//
64+
// ````
65+
// [class.foo]
66+
// ^
67+
// ```
68+
#[bytes(b'.')]
69+
// Twig-like templating languages, e.g.:
70+
//
71+
// ```
72+
// <div class="{% if true %}flex{% else %}block{% endif %}">
73+
// ^
74+
// ```
75+
#[bytes(b'}')]
76+
Before,
77+
78+
// Clojure and Angular like languages, e.g.:
79+
// ```
80+
// [:div.p-2]
81+
// ^
82+
// [class.foo]
83+
// ^
84+
// ```
85+
#[bytes(b']')]
86+
// Twig like templating languages, e.g.:
87+
//
88+
// ```
89+
// <div class="{% if true %}flex{% else %}block{% endif %}">
90+
// ^
91+
// ```
92+
#[bytes(b'{')]
93+
// Svelte like attributes, e.g.:
94+
//
95+
// ```
96+
// <div class:flex="bool"></div>
97+
// ^
98+
// ```
99+
#[bytes(b'=')]
100+
After,
101+
102+
#[fallback]
103+
Other,
104+
}

crates/oxide/src/extractor/candidate_machine.rs

+1-50
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use crate::cursor;
2+
use crate::extractor::boundary::{has_valid_boundaries, is_valid_before_boundary};
23
use crate::extractor::machine::{Machine, MachineState};
34
use crate::extractor::utility_machine::UtilityMachine;
45
use crate::extractor::variant_machine::VariantMachine;
@@ -176,56 +177,6 @@ impl CandidateMachine {
176177
}
177178
}
178179

179-
/// A candidate must be preceded or followed by any of these characters
180-
/// E.g.: `<div class="flex">`
181-
/// ^ Valid for `flex`
182-
/// ^ Invalid for `div`
183-
#[inline(always)]
184-
fn is_valid_common_boundary(c: &u8) -> bool {
185-
matches!(
186-
c,
187-
b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'"' | b'\'' | b'`' | b'\0'
188-
)
189-
}
190-
191-
/// A candidate must be preceded by any of these characters.
192-
#[inline(always)]
193-
fn is_valid_before_boundary(c: &u8) -> bool {
194-
is_valid_common_boundary(c) || matches!(c, b'.' | b'}')
195-
}
196-
197-
/// A candidate must be followed by any of these characters.
198-
///
199-
/// E.g.: `[class.foo]` Angular
200-
/// E.g.: `<div class:flex="bool">` Svelte
201-
/// ^
202-
#[inline(always)]
203-
pub fn is_valid_after_boundary(c: &u8) -> bool {
204-
is_valid_common_boundary(c) || matches!(c, b'}' | b']' | b'=' | b'{')
205-
}
206-
207-
#[inline(always)]
208-
fn has_valid_boundaries(span: &Span, input: &[u8]) -> bool {
209-
let before = {
210-
if span.start == 0 {
211-
b'\0'
212-
} else {
213-
input[span.start - 1]
214-
}
215-
};
216-
217-
let after = {
218-
if span.end >= input.len() - 1 {
219-
b'\0'
220-
} else {
221-
input[span.end + 1]
222-
}
223-
};
224-
225-
// Ensure the span has valid boundary characters before and after
226-
is_valid_before_boundary(&before) && is_valid_after_boundary(&after)
227-
}
228-
229180
#[cfg(test)]
230181
mod tests {
231182
use super::CandidateMachine;

crates/oxide/src/extractor/mod.rs

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use std::fmt;
88
pub mod arbitrary_property_machine;
99
pub mod arbitrary_value_machine;
1010
pub mod arbitrary_variable_machine;
11+
mod boundary;
1112
pub mod bracket_stack;
1213
pub mod candidate_machine;
1314
pub mod css_variable_machine;

crates/oxide/src/extractor/named_utility_machine.rs

+2-5
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
use crate::cursor;
22
use crate::extractor::arbitrary_value_machine::ArbitraryValueMachine;
33
use crate::extractor::arbitrary_variable_machine::ArbitraryVariableMachine;
4-
use crate::extractor::candidate_machine::is_valid_after_boundary;
4+
use crate::extractor::boundary::is_valid_after_boundary;
55
use crate::extractor::machine::{Machine, MachineState};
66
use classification_macros::ClassifyBytes;
77

@@ -485,10 +485,7 @@ mod tests {
485485
vec!["let", "classes", "true"],
486486
),
487487
// Inside an object (no spaces, key)
488-
(
489-
r#"let classes = {'{}':true};"#,
490-
vec!["let", "classes", "true"],
491-
),
488+
(r#"let classes = {'{}':true};"#, vec!["let", "classes"]),
492489
// Inside an object (value)
493490
(
494491
r#"let classes = { primary: '{}' };"#,

crates/oxide/src/extractor/pre_processors/ruby.rs

+41-2
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ impl PreProcessor for Ruby {
2727
let boundary = match cursor.curr {
2828
b'[' => b']',
2929
b'(' => b')',
30+
b'{' => b'}',
3031
_ => {
3132
cursor.advance();
3233
continue;
@@ -54,12 +55,12 @@ impl PreProcessor for Ruby {
5455
}
5556

5657
// Start of a nested bracket
57-
b'[' | b'(' => {
58+
b'[' | b'(' | b'{' => {
5859
bracket_stack.push(cursor.curr);
5960
}
6061

6162
// End of a nested bracket
62-
b']' | b')' if !bracket_stack.is_empty() => {
63+
b']' | b')' | b'}' if !bracket_stack.is_empty() => {
6364
if !bracket_stack.pop(cursor.curr) {
6465
// Unbalanced
6566
cursor.advance();
@@ -98,6 +99,12 @@ mod tests {
9899
"%w[flex data-[state=pending]:bg-[#0088cc] flex-col]",
99100
"%w flex data-[state=pending]:bg-[#0088cc] flex-col ",
100101
),
102+
// %w{…}
103+
("%w{flex px-2.5}", "%w flex px-2.5 "),
104+
(
105+
"%w{flex data-[state=pending]:bg-(--my-color) flex-col}",
106+
"%w flex data-[state=pending]:bg-(--my-color) flex-col ",
107+
),
101108
// %w(…)
102109
("%w(flex px-2.5)", "%w flex px-2.5 "),
103110
(
@@ -114,4 +121,36 @@ mod tests {
114121
Ruby::test(input, expected);
115122
}
116123
}
124+
125+
#[test]
126+
fn test_ruby_extraction() {
127+
for (input, expected) in [
128+
// %w[…]
129+
("%w[flex px-2.5]", vec!["flex", "px-2.5"]),
130+
("%w[px-2.5 flex]", vec!["flex", "px-2.5"]),
131+
("%w[2xl:flex]", vec!["2xl:flex"]),
132+
(
133+
"%w[flex data-[state=pending]:bg-[#0088cc] flex-col]",
134+
vec!["flex", "data-[state=pending]:bg-[#0088cc]", "flex-col"],
135+
),
136+
// %w{…}
137+
("%w{flex px-2.5}", vec!["flex", "px-2.5"]),
138+
("%w{px-2.5 flex}", vec!["flex", "px-2.5"]),
139+
("%w{2xl:flex}", vec!["2xl:flex"]),
140+
(
141+
"%w{flex data-[state=pending]:bg-(--my-color) flex-col}",
142+
vec!["flex", "data-[state=pending]:bg-(--my-color)", "flex-col"],
143+
),
144+
// %w(…)
145+
("%w(flex px-2.5)", vec!["flex", "px-2.5"]),
146+
("%w(px-2.5 flex)", vec!["flex", "px-2.5"]),
147+
("%w(2xl:flex)", vec!["2xl:flex"]),
148+
(
149+
"%w(flex data-[state=pending]:bg-(--my-color) flex-col)",
150+
vec!["flex", "data-[state=pending]:bg-(--my-color)", "flex-col"],
151+
),
152+
] {
153+
Ruby::test_extract_contains(input, expected);
154+
}
155+
}
117156
}

crates/oxide/src/extractor/utility_machine.rs

+1-4
Original file line numberDiff line numberDiff line change
@@ -312,10 +312,7 @@ mod tests {
312312
vec!["let", "classes", "true"],
313313
),
314314
// Inside an object (no spaces, key)
315-
(
316-
r#"let classes = {'{}':true};"#,
317-
vec!["let", "classes", "true"],
318-
),
315+
(r#"let classes = {'{}':true};"#, vec!["let", "classes"]),
319316
// Inside an object (value)
320317
(
321318
r#"let classes = { primary: '{}' };"#,

0 commit comments

Comments
 (0)