Skip to content

Commit b82ed4d

Browse files
authored
Merge pull request #77 from cessen/faster_graphemes
Optimization for grapheme iteration.
2 parents ac54e2d + 29ecd77 commit b82ed4d

File tree

5 files changed

+94
-48
lines changed

5 files changed

+94
-48
lines changed

Diff for: scripts/unicode.py

+12-5
Original file line numberDiff line numberDiff line change
@@ -280,22 +280,29 @@ def emit_break_module(f, break_table, break_cats, name):
280280
f.write((" %sC_" % Name[0]) + cat + ",\n")
281281
f.write(""" }
282282
283-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> %sCat {
283+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
284+
use core;
284285
use core::cmp::Ordering::{Equal, Less, Greater};
285286
match r.binary_search_by(|&(lo, hi, _)| {
286287
if lo <= c && c <= hi { Equal }
287288
else if hi < c { Less }
288289
else { Greater }
289290
}) {
290291
Ok(idx) => {
291-
let (_, _, cat) = r[idx];
292-
cat
292+
let (lower, upper, cat) = r[idx];
293+
(lower as u32, upper as u32, cat)
294+
}
295+
Err(idx) => {
296+
(
297+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
298+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
299+
%sC_Any,
300+
)
293301
}
294-
Err(_) => %sC_Any
295302
}
296303
}
297304
298-
pub fn %s_category(c: char) -> %sCat {
305+
pub fn %s_category(c: char) -> (u32, u32, %sCat) {
299306
bsearch_range_value_table(c, %s_cat_table)
300307
}
301308

Diff for: src/grapheme.rs

+26-15
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,8 @@ pub struct GraphemeCursor {
178178
// Set if a call to `prev_boundary` or `next_boundary` was suspended due
179179
// to needing more input.
180180
resuming: bool,
181+
// Cached grapheme category and associated scalar value range.
182+
grapheme_cat_cache: (u32, u32, GraphemeCat),
181183
}
182184

183185
/// An error return indicating that not enough content was available in the
@@ -276,9 +278,20 @@ impl GraphemeCursor {
276278
pre_context_offset: None,
277279
ris_count: None,
278280
resuming: false,
281+
grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
279282
}
280283
}
281284

285+
fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
286+
use tables::grapheme as gr;
287+
// If this char isn't within the cached range, update the cache to the
288+
// range that includes it.
289+
if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
290+
self.grapheme_cat_cache = gr::grapheme_category(ch);
291+
}
292+
self.grapheme_cat_cache.2
293+
}
294+
282295
// Not sure I'm gonna keep this, the advantage over new() seems thin.
283296

284297
/// Set the cursor to a new location in the same string.
@@ -349,7 +362,7 @@ impl GraphemeCursor {
349362
self.pre_context_offset = None;
350363
if self.is_extended && chunk_start + chunk.len() == self.offset {
351364
let ch = chunk.chars().rev().next().unwrap();
352-
if gr::grapheme_category(ch) == gr::GC_Prepend {
365+
if self.grapheme_category(ch) == gr::GC_Prepend {
353366
self.decide(false); // GB9b
354367
return;
355368
}
@@ -359,7 +372,7 @@ impl GraphemeCursor {
359372
GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
360373
_ => if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
361374
let ch = chunk.chars().rev().next().unwrap();
362-
self.cat_before = Some(gr::grapheme_category(ch));
375+
self.cat_before = Some(self.grapheme_category(ch));
363376
},
364377
}
365378
}
@@ -393,7 +406,7 @@ impl GraphemeCursor {
393406
use tables::grapheme as gr;
394407
let mut ris_count = self.ris_count.unwrap_or(0);
395408
for ch in chunk.chars().rev() {
396-
if gr::grapheme_category(ch) != gr::GC_Regional_Indicator {
409+
if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
397410
self.ris_count = Some(ris_count);
398411
self.decide((ris_count % 2) == 0);
399412
return;
@@ -413,13 +426,13 @@ impl GraphemeCursor {
413426
use tables::grapheme as gr;
414427
let mut iter = chunk.chars().rev();
415428
if let Some(ch) = iter.next() {
416-
if gr::grapheme_category(ch) != gr::GC_ZWJ {
429+
if self.grapheme_category(ch) != gr::GC_ZWJ {
417430
self.decide(true);
418431
return;
419432
}
420433
}
421434
for ch in iter {
422-
match gr::grapheme_category(ch) {
435+
match self.grapheme_category(ch) {
423436
gr::GC_Extend => (),
424437
gr::GC_Extended_Pictographic => {
425438
self.decide(false);
@@ -481,7 +494,7 @@ impl GraphemeCursor {
481494
let offset_in_chunk = self.offset - chunk_start;
482495
if self.cat_after.is_none() {
483496
let ch = chunk[offset_in_chunk..].chars().next().unwrap();
484-
self.cat_after = Some(gr::grapheme_category(ch));
497+
self.cat_after = Some(self.grapheme_category(ch));
485498
}
486499
if self.offset == chunk_start {
487500
let mut need_pre_context = true;
@@ -497,7 +510,7 @@ impl GraphemeCursor {
497510
}
498511
if self.cat_before.is_none() {
499512
let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
500-
self.cat_before = Some(gr::grapheme_category(ch));
513+
self.cat_before = Some(self.grapheme_category(ch));
501514
}
502515
match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
503516
PairResult::NotBreak => return self.decision(false),
@@ -553,7 +566,6 @@ impl GraphemeCursor {
553566
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
554567
/// ```
555568
pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
556-
use tables::grapheme as gr;
557569
if self.offset == self.len {
558570
return Ok(None);
559571
}
@@ -562,14 +574,14 @@ impl GraphemeCursor {
562574
loop {
563575
if self.resuming {
564576
if self.cat_after.is_none() {
565-
self.cat_after = Some(gr::grapheme_category(ch));
577+
self.cat_after = Some(self.grapheme_category(ch));
566578
}
567579
} else {
568580
self.offset += ch.len_utf8();
569581
self.state = GraphemeState::Unknown;
570582
self.cat_before = self.cat_after.take();
571583
if self.cat_before.is_none() {
572-
self.cat_before = Some(gr::grapheme_category(ch));
584+
self.cat_before = Some(self.grapheme_category(ch));
573585
}
574586
if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
575587
self.ris_count = self.ris_count.map(|c| c + 1);
@@ -578,7 +590,7 @@ impl GraphemeCursor {
578590
}
579591
if let Some(next_ch) = iter.next() {
580592
ch = next_ch;
581-
self.cat_after = Some(gr::grapheme_category(ch));
593+
self.cat_after = Some(self.grapheme_category(ch));
582594
} else if self.offset == self.len {
583595
self.decide(true);
584596
} else {
@@ -629,7 +641,6 @@ impl GraphemeCursor {
629641
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
630642
/// ```
631643
pub fn prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result<Option<usize>, GraphemeIncomplete> {
632-
use tables::grapheme as gr;
633644
if self.offset == 0 {
634645
return Ok(None);
635646
}
@@ -644,7 +655,7 @@ impl GraphemeCursor {
644655
return Err(GraphemeIncomplete::PrevChunk);
645656
}
646657
if self.resuming {
647-
self.cat_before = Some(gr::grapheme_category(ch));
658+
self.cat_before = Some(self.grapheme_category(ch));
648659
} else {
649660
self.offset -= ch.len_utf8();
650661
self.cat_after = self.cat_before.take();
@@ -654,12 +665,12 @@ impl GraphemeCursor {
654665
}
655666
if let Some(prev_ch) = iter.next() {
656667
ch = prev_ch;
657-
self.cat_before = Some(gr::grapheme_category(ch));
668+
self.cat_before = Some(self.grapheme_category(ch));
658669
} else if self.offset == 0 {
659670
self.decide(true);
660671
} else {
661672
self.resuming = true;
662-
self.cat_after = Some(gr::grapheme_category(ch));
673+
self.cat_after = Some(self.grapheme_category(ch));
663674
return Err(GraphemeIncomplete::PrevChunk);
664675
}
665676
}

Diff for: src/sentence.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ mod fwd {
115115

116116
for next_char in ahead.chars() {
117117
//( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
118-
match se::sentence_category(next_char) {
118+
match se::sentence_category(next_char).2 {
119119
se::SC_Lower => return true,
120120
se::SC_OLetter |
121121
se::SC_Upper |
@@ -182,7 +182,7 @@ mod fwd {
182182
let position_before = self.pos;
183183
let state_before = self.state.clone();
184184

185-
let next_cat = se::sentence_category(next_char);
185+
let next_cat = se::sentence_category(next_char).2;
186186

187187
self.pos += next_char.len_utf8();
188188
self.state = self.state.next(next_cat);

Diff for: src/tables.rs

+48-20
Original file line numberDiff line numberDiff line change
@@ -345,22 +345,29 @@ pub mod grapheme {
345345
GC_ZWJ,
346346
}
347347

348-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat {
348+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> (u32, u32, GraphemeCat) {
349+
use core;
349350
use core::cmp::Ordering::{Equal, Less, Greater};
350351
match r.binary_search_by(|&(lo, hi, _)| {
351352
if lo <= c && c <= hi { Equal }
352353
else if hi < c { Less }
353354
else { Greater }
354355
}) {
355356
Ok(idx) => {
356-
let (_, _, cat) = r[idx];
357-
cat
357+
let (lower, upper, cat) = r[idx];
358+
(lower as u32, upper as u32, cat)
359+
}
360+
Err(idx) => {
361+
(
362+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
363+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
364+
GC_Any,
365+
)
358366
}
359-
Err(_) => GC_Any
360367
}
361368
}
362369

363-
pub fn grapheme_category(c: char) -> GraphemeCat {
370+
pub fn grapheme_category(c: char) -> (u32, u32, GraphemeCat) {
364371
bsearch_range_value_table(c, grapheme_cat_table)
365372
}
366373

@@ -980,22 +987,29 @@ pub mod word {
980987
WC_ZWJ,
981988
}
982989

983-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)]) -> WordCat {
990+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)]) -> (u32, u32, WordCat) {
991+
use core;
984992
use core::cmp::Ordering::{Equal, Less, Greater};
985993
match r.binary_search_by(|&(lo, hi, _)| {
986994
if lo <= c && c <= hi { Equal }
987995
else if hi < c { Less }
988996
else { Greater }
989997
}) {
990998
Ok(idx) => {
991-
let (_, _, cat) = r[idx];
992-
cat
999+
let (lower, upper, cat) = r[idx];
1000+
(lower as u32, upper as u32, cat)
1001+
}
1002+
Err(idx) => {
1003+
(
1004+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
1005+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
1006+
WC_Any,
1007+
)
9931008
}
994-
Err(_) => WC_Any
9951009
}
9961010
}
9971011

998-
pub fn word_category(c: char) -> WordCat {
1012+
pub fn word_category(c: char) -> (u32, u32, WordCat) {
9991013
bsearch_range_value_table(c, word_cat_table)
10001014
}
10011015

@@ -1439,22 +1453,29 @@ pub mod emoji {
14391453
EC_Extended_Pictographic,
14401454
}
14411455

1442-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)]) -> EmojiCat {
1456+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)]) -> (u32, u32, EmojiCat) {
1457+
use core;
14431458
use core::cmp::Ordering::{Equal, Less, Greater};
14441459
match r.binary_search_by(|&(lo, hi, _)| {
14451460
if lo <= c && c <= hi { Equal }
14461461
else if hi < c { Less }
14471462
else { Greater }
14481463
}) {
14491464
Ok(idx) => {
1450-
let (_, _, cat) = r[idx];
1451-
cat
1465+
let (lower, upper, cat) = r[idx];
1466+
(lower as u32, upper as u32, cat)
1467+
}
1468+
Err(idx) => {
1469+
(
1470+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
1471+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
1472+
EC_Any,
1473+
)
14521474
}
1453-
Err(_) => EC_Any
14541475
}
14551476
}
14561477

1457-
pub fn emoji_category(c: char) -> EmojiCat {
1478+
pub fn emoji_category(c: char) -> (u32, u32, EmojiCat) {
14581479
bsearch_range_value_table(c, emoji_cat_table)
14591480
}
14601481

@@ -1535,22 +1556,29 @@ pub mod sentence {
15351556
SC_Upper,
15361557
}
15371558

1538-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)]) -> SentenceCat {
1559+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)]) -> (u32, u32, SentenceCat) {
1560+
use core;
15391561
use core::cmp::Ordering::{Equal, Less, Greater};
15401562
match r.binary_search_by(|&(lo, hi, _)| {
15411563
if lo <= c && c <= hi { Equal }
15421564
else if hi < c { Less }
15431565
else { Greater }
15441566
}) {
15451567
Ok(idx) => {
1546-
let (_, _, cat) = r[idx];
1547-
cat
1568+
let (lower, upper, cat) = r[idx];
1569+
(lower as u32, upper as u32, cat)
1570+
}
1571+
Err(idx) => {
1572+
(
1573+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
1574+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
1575+
SC_Any,
1576+
)
15481577
}
1549-
Err(_) => SC_Any
15501578
}
15511579
}
15521580

1553-
pub fn sentence_category(c: char) -> SentenceCat {
1581+
pub fn sentence_category(c: char) -> (u32, u32, SentenceCat) {
15541582
bsearch_range_value_table(c, sentence_cat_table)
15551583
}
15561584

0 commit comments

Comments
 (0)