Skip to content

Commit fb17f9e

Browse files
committed
Auto merge of #122013 - Swatinem:unicode-gen-fastpath, r=<try>
Add a lower bound check to `unicode-table-generator` output. This adds a dedicated check for the lower bound (if it is outside of the ASCII range) to the output of the `unicode-table-generator` tool. This generalizes the ASCII-only fast path, but for now it only takes effect for the `Grapheme_Extend` property, as that is the only one with a lower bound outside of ASCII.
2 parents d5db7fb + 6d7daa0 commit fb17f9e

File tree

4 files changed

+12
-3
lines changed

4 files changed

+12
-3
lines changed

library/core/src/char/methods.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -927,7 +927,7 @@ impl char {
927927
#[must_use]
928928
#[inline]
929929
pub(crate) fn is_grapheme_extended(self) -> bool {
930-
self > '\x7f' && unicode::Grapheme_Extend(self)
930+
unicode::Grapheme_Extend(self)
931931
}
932932

933933
/// Returns `true` if this `char` has one of the general categories for numbers.

library/core/src/unicode/unicode_data.rs

+1
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,7 @@ pub mod grapheme_extend {
316316
128, 240, 0,
317317
];
318318
pub fn lookup(c: char) -> bool {
319+
(c as u32) >= 0x300 &&
319320
super::skip_search(
320321
c as u32,
321322
&SHORT_OFFSET_RUNS,

src/tools/unicode-table-generator/src/raw_emitter.rs

+5-1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ impl RawEmitter {
2323
}
2424

2525
fn emit_bitset(&mut self, ranges: &[Range<u32>]) -> Result<(), String> {
26+
let first_code_point = ranges.first().unwrap().start;
2627
let last_code_point = ranges.last().unwrap().end;
2728
// bitset for every bit in the codepoint range
2829
//
@@ -101,7 +102,10 @@ impl RawEmitter {
101102
)
102103
.unwrap();
103104
writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
104-
writeln!(&mut self.file, " super::bitset_search(",).unwrap();
105+
if first_code_point > 0x7f {
106+
writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} &&").unwrap();
107+
}
108+
writeln!(&mut self.file, " super::bitset_search(").unwrap();
105109
writeln!(&mut self.file, " c as u32,").unwrap();
106110
writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
107111
writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap();

src/tools/unicode-table-generator/src/skiplist.rs

+5-1
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,9 @@ impl ShortOffsetRunHeader {
2525

2626
impl RawEmitter {
2727
pub fn emit_skiplist(&mut self, ranges: &[Range<u32>]) {
28+
let first_code_point = ranges.first().unwrap().start;
2829
let mut offsets = Vec::<u32>::new();
29-
let points = ranges.iter().flat_map(|r| vec![r.start, r.end]).collect::<Vec<u32>>();
30+
let points = ranges.iter().flat_map(|r| [r.start, r.end]).collect::<Vec<u32>>();
3031
let mut offset = 0;
3132
for pt in points {
3233
let delta = pt - offset;
@@ -87,6 +88,9 @@ impl RawEmitter {
8788
self.bytes_used += coded_offsets.len();
8889

8990
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
91+
if first_code_point > 0x7f {
92+
writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} &&").unwrap();
93+
}
9094
writeln!(&mut self.file, " super::skip_search(",).unwrap();
9195
writeln!(&mut self.file, " c as u32,").unwrap();
9296
writeln!(&mut self.file, " &SHORT_OFFSET_RUNS,").unwrap();

0 commit comments

Comments
 (0)