From f0df6be3f4206df0e75abebb38e891ee9124d4ad Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Fri, 3 Mar 2017 22:45:57 -0800 Subject: [PATCH 1/7] Starting to implement rope-capable API Very much work in progress. See #21 --- src/grapheme.rs | 250 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 249 insertions(+), 1 deletion(-) diff --git a/src/grapheme.rs b/src/grapheme.rs index 49039b7..ac7b886 100644 --- a/src/grapheme.rs +++ b/src/grapheme.rs @@ -101,6 +101,7 @@ enum GraphemeState { Regional, Emoji, Zwj, + Unknown, } impl<'a> Iterator for Graphemes<'a> { @@ -226,6 +227,7 @@ impl<'a> Iterator for Graphemes<'a> { break; } }, + Unknown => unreachable!(), } } @@ -388,7 +390,8 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> { take_curr = false; break; } - } + }, + Unknown => unreachable!(), } } @@ -433,3 +436,248 @@ pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> { pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> { GraphemeIndices { start_offset: s.as_ptr() as usize, iter: new_graphemes(s, is_extended) } } + +// maybe unify with PairResult? +#[derive(PartialEq, Eq)] +enum GraphemeCursorState { + Unknown, + NotBreak, + Break, + CheckCrlf, + Regional, + Emoji, +} + +pub struct GraphemeCursor { + offset: usize, // current cursor position + len: usize, // total length of the string + is_extended: bool, + state: GraphemeCursorState, + cat: Option, // category of codepoint immediately preceding cursor + catb: Option, // category of codepoint immediately after cursor + pre_context_offset: Option, + ris_count: Option, +} + +#[derive(PartialEq, Eq)] +pub enum GraphemeIncomplete { + PreContext(usize), // need pre-context for chunk ending at usize + PrevChunk, // requesting chunk previous to the one given + NextChunk, // requesting chunk following the one given + InvalidOffset, // error, chunk given is not inside cursor +} + +#[derive(PartialEq, Eq)] +enum PairResult { + NotBreak, // definitely not a break + Break, // definitely a break + Extended, // a break if in extended mode + CheckCrlf, // a break unless it's a CR LF pair + Regional, // a break if preceded by an even number of RIS + Emoji, // a break if preceded by emoji base and extend +} + +fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult { + use self::PairResult::*; + use tables::grapheme::GraphemeCat::*; + match (before, after) { + (GC_Control, GC_Control) => CheckCrlf, // GB3 + (GC_Control, _) => Break, // GB4 + (_, GC_Control) => Break, // GB5 + (GC_L, GC_L) => NotBreak, // GB6 + (GC_L, GC_V) => NotBreak, // GB6 + (GC_L, GC_LV) => NotBreak, // GB6 + (GC_L, GC_LVT) => NotBreak, // GB6 + (GC_LV, GC_V) => NotBreak, // GB7 + (GC_LV, GC_T) => NotBreak, // GB7 + (GC_V, GC_V) => NotBreak, // GB7 + (GC_V, GC_T) => NotBreak, // GB7 + (GC_LVT, GC_T) => NotBreak, // GB8 + (GC_T, GC_T) => NotBreak, // GB8 + (_, GC_Extend) => NotBreak, // GB9 + (_, GC_ZWJ) => NotBreak, // GB9 + (_, GC_SpacingMark) => Extended, // GB9a + (GC_Prepend, _) => Extended, // GB9a + (GC_Base, GC_E_Modifier) => NotBreak, // GB10 + (GC_E_Base_GAZ, GC_E_Modifier) => NotBreak, // GB10 + (GC_Extend, GC_E_Modifier) => Emoji, // GB10 + (GC_ZWJ, GC_Glue_After_Zwj) => NotBreak, // GB11 + (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13 + (_, _) => Break, // GB999 + } +} + +impl GraphemeCursor { + pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor { + use tables::grapheme as gr; + let state = if offset == 0 || offset == len { + GraphemeCursorState::Break + } else { + GraphemeCursorState::Unknown + }; + GraphemeCursor { + offset: offset, + len: len, + state: state, + is_extended: is_extended, + cat: None, + catb: None, + pre_context_offset: None, + ris_count: None, + } + } + + pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) { + use tables::grapheme as gr; + assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap()); + self.pre_context_offset = None; + if self.is_extended && chunk_start + chunk.len() == self.offset { + let ch = chunk.chars().rev().next().unwrap(); + if gr::grapheme_category(ch) == gr::GC_Prepend { + self.decide(false); + return; + } + } + match self.state { + GraphemeCursorState::CheckCrlf => { + let is_break = chunk.as_bytes()[chunk.len() - 1] != b'\r'; + self.decide(is_break); + } + GraphemeCursorState::Regional => self.handle_regional(chunk, chunk_start), + GraphemeCursorState::Emoji => self.handle_emoji(chunk, chunk_start), + _ => panic!("invalid state") + } + } + + fn decide(&mut self, is_break: bool) { + self.state = if is_break { + GraphemeCursorState::Break + } else { + GraphemeCursorState::NotBreak + }; + } + + fn decision(&mut self, is_break: bool) -> Result { + self.decide(is_break); + Ok(is_break) + } + + fn is_boundary_result(&self) -> Result { + if self.state == GraphemeCursorState::Break { + Ok(true) + } else if self.state == GraphemeCursorState::NotBreak { + Ok(false) + } else if let Some(pre_context_offset) = self.pre_context_offset { + Err(GraphemeIncomplete::PreContext(pre_context_offset)) + } else { + unreachable!("inconsistent state"); + } + } + + fn handle_regional(&mut self, chunk: &str, chunk_start: usize) { + use tables::grapheme as gr; + let mut ris_count = self.ris_count.unwrap_or(0); + for ch in chunk.chars().rev() { + if gr::grapheme_category(ch) != gr::GC_Regional_Indicator { + self.ris_count = Some(ris_count); + self.decide((ris_count & 1) == 0); + return; + } + ris_count += 1; + } + self.ris_count = Some(ris_count); + if chunk_start == 0 { + self.decide((ris_count & 1) == 0); + return; + } + self.pre_context_offset = Some(chunk_start); + } + + fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) { + use tables::grapheme as gr; + for ch in chunk.chars().rev() { + match gr::grapheme_category(ch) { + gr::GC_Extend => (), + gr::GC_E_Base | gr::GC_E_Base_GAZ => { + self.decide(false); + return; + } + _ => { + self.decide(true); + return; + } + } + } + if chunk_start == 0 { + self.decide(true); + return; + } + self.pre_context_offset = Some(chunk_start); + } + + pub fn is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result { + use tables::grapheme as gr; + if self.state == GraphemeCursorState::Break { + return Ok(true) + } + if self.state == GraphemeCursorState::NotBreak { + return Ok(false) + } + if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() { + return Err(GraphemeIncomplete::InvalidOffset) + } + if let Some(pre_context_offset) = self.pre_context_offset { + return Err(GraphemeIncomplete::PreContext(pre_context_offset)); + } + let offset_in_chunk = self.offset - chunk_start; + if self.catb.is_none() { + let ch = chunk[offset_in_chunk..].chars().next().unwrap(); + self.catb = Some(gr::grapheme_category(ch)); + } + if self.offset == chunk_start { + match self.catb.unwrap() { + gr::GC_Control => { + if chunk.as_bytes()[offset_in_chunk] == b'\n' { + self.state = GraphemeCursorState::CheckCrlf; + } + } + gr::GC_Regional_Indicator => self.state = GraphemeCursorState::Regional, + gr::GC_E_Modifier => self.state = GraphemeCursorState::Emoji, + _ => () + } + self.pre_context_offset = Some(chunk_start); + return Err(GraphemeIncomplete::PreContext(chunk_start)); + } + if self.cat.is_none() { + let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap(); + self.cat = Some(gr::grapheme_category(ch)); + } + match check_pair(self.cat.unwrap(), self.catb.unwrap()) { + PairResult::NotBreak => return self.decision(false), + PairResult::Break => return self.decision(true), + PairResult::Extended => { + let is_extended = self.is_extended; + return self.decision(is_extended); + } + PairResult::CheckCrlf => { + if chunk.as_bytes()[offset_in_chunk] != b'\n' { + return self.decision(true); + } + if self.offset > chunk_start { + return self.decision(chunk.as_bytes()[offset_in_chunk - 1] != b'\r'); + } + self.state = GraphemeCursorState::CheckCrlf; + return Err(GraphemeIncomplete::PreContext(chunk_start)); + } + PairResult::Regional => { + self.handle_regional(&chunk[..offset_in_chunk], chunk_start); + self.is_boundary_result() + } + PairResult::Emoji => { + self.handle_emoji(&chunk[..offset_in_chunk], chunk_start); + self.is_boundary_result() + } + } + } + +} From aaf9da43ad854aaa776f27a48d001675772c0422 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Fri, 3 Mar 2017 23:20:23 -0800 Subject: [PATCH 2/7] Continuing new grapheme boundary logic Implemented next_boundary and prev_boundary functions in terms of is_boundary (plus fixups to the internal state when moving the cursor). Fixed various problems in previous commit. Still work in progress, not tested yet. --- src/grapheme.rs | 67 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 3 deletions(-) diff --git a/src/grapheme.rs b/src/grapheme.rs index ac7b886..7383e43 100644 --- a/src/grapheme.rs +++ b/src/grapheme.rs @@ -478,8 +478,8 @@ enum PairResult { } fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult { - use self::PairResult::*; use tables::grapheme::GraphemeCat::*; + use self::PairResult::*; match (before, after) { (GC_Control, GC_Control) => CheckCrlf, // GB3 (GC_Control, _) => Break, // GB4 @@ -498,7 +498,7 @@ fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult { (_, GC_ZWJ) => NotBreak, // GB9 (_, GC_SpacingMark) => Extended, // GB9a (GC_Prepend, _) => Extended, // GB9a - (GC_Base, GC_E_Modifier) => NotBreak, // GB10 + (GC_E_Base, GC_E_Modifier) => NotBreak, // GB10 (GC_E_Base_GAZ, GC_E_Modifier) => NotBreak, // GB10 (GC_Extend, GC_E_Modifier) => Emoji, // GB10 (GC_ZWJ, GC_Glue_After_Zwj) => NotBreak, // GB11 @@ -527,6 +527,17 @@ impl GraphemeCursor { } } + pub fn set_cursor(&mut self, offset: usize) { + if offset != self.offset { + self.offset = offset; + self.state = if offset == 0 || offset == self.len { + GraphemeCursorState::Break + } else { + GraphemeCursorState::Unknown + }; + } + } + pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) { use tables::grapheme as gr; assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap()); @@ -534,7 +545,7 @@ impl GraphemeCursor { if self.is_extended && chunk_start + chunk.len() == self.offset { let ch = chunk.chars().rev().next().unwrap(); if gr::grapheme_category(ch) == gr::GC_Prepend { - self.decide(false); + self.decide(false); // GB9b return; } } @@ -680,4 +691,54 @@ impl GraphemeCursor { } } + pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result, GraphemeIncomplete> { + if self.offset == self.len { + return Ok(None); + } + loop { + let ch = chunk[self.offset - chunk_start..].chars().next().unwrap(); + self.offset += ch.len_utf8(); + self.cat = self.catb.take(); + self.state = GraphemeCursorState::Unknown; + if let (Some(ris_count), Some(cat)) = (self.ris_count, self.cat) { + if cat == GraphemeCat::GC_Regional_Indicator { + self.ris_count = Some(ris_count + 1); + } else { + self.ris_count = Some(0); + } + } + if self.offset == self.len { + self.decide(true); + } else if self.offset >= chunk_start + chunk.len() { + return Err(GraphemeIncomplete::NextChunk); + } + if self.is_boundary(chunk, chunk_start)? { + return Ok(Some(self.offset)); + } + } + } + + pub fn prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result, GraphemeIncomplete> { + if self.offset == 0 { + return Ok(None); + } + loop { + if self.offset == chunk_start { + return Err(GraphemeIncomplete::PrevChunk); + } + let ch = chunk[..self.offset - chunk_start].chars().rev().next().unwrap(); + self.offset -= ch.len_utf8(); + self.catb = self.cat.take(); + self.state = GraphemeCursorState::Unknown; + if let Some(ris_count) = self.ris_count { + self.ris_count = if ris_count > 0 { Some(ris_count - 1) } else { None }; + } + if self.offset == 0 { + self.decide(true); + } + if self.is_boundary(chunk, chunk_start)? { + return Ok(Some(self.offset)); + } + } + } } From ae18631f337d9b8bfc104d1246a9d9113a9ba14f Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Sat, 4 Mar 2017 16:11:44 -0800 Subject: [PATCH 3/7] Switch existing iterators to cursor implementation Also, additional state machine work, mostly resume logic in next and prev grapheme cluster boundary methods. Includes some documentation, and also a bit of renaming from the earlier development drafts. --- src/grapheme.rs | 563 ++++++++++++++++-------------------------------- src/lib.rs | 1 + 2 files changed, 186 insertions(+), 378 deletions(-) diff --git a/src/grapheme.rs b/src/grapheme.rs index 7383e43..25f463a 100644 --- a/src/grapheme.rs +++ b/src/grapheme.rs @@ -64,10 +64,8 @@ impl<'a> DoubleEndedIterator for GraphemeIndices<'a> { #[derive(Clone)] pub struct Graphemes<'a> { string: &'a str, - extended: bool, - cat: Option, - catb: Option, - regional_count_back: Option, + cursor: GraphemeCursor, + cursor_back: GraphemeCursor, } impl<'a> Graphemes<'a> { @@ -85,350 +83,49 @@ impl<'a> Graphemes<'a> { /// assert_eq!(iter.as_str(), ""); /// ``` pub fn as_str(&self) -> &'a str { - self.string + &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()] } } -// state machine for cluster boundary rules -#[derive(Copy,Clone,PartialEq,Eq)] -enum GraphemeState { - Start, - FindExtend, - HangulL, - HangulLV, - HangulLVT, - Prepend, - Regional, - Emoji, - Zwj, - Unknown, -} - impl<'a> Iterator for Graphemes<'a> { type Item = &'a str; #[inline] fn size_hint(&self) -> (usize, Option) { - let slen = self.string.len(); + let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor(); (cmp::min(slen, 1), Some(slen)) } #[inline] fn next(&mut self) -> Option<&'a str> { - use self::GraphemeState::*; - use tables::grapheme as gr; - if self.string.len() == 0 { + let start = self.cursor.cur_cursor(); + if start == self.cursor_back.cur_cursor() { return None; } - - let mut take_curr = true; - let mut idx = 0; - let mut state = Start; - let mut cat = gr::GC_Any; - - // caching used by next_back() should be invalidated - self.regional_count_back = None; - self.catb = None; - - for (curr, ch) in self.string.char_indices() { - idx = curr; - - // retrieve cached category, if any - // We do this because most of the time we would end up - // looking up each character twice. - cat = match self.cat { - None => gr::grapheme_category(ch), - _ => self.cat.take().unwrap() - }; - - if (state, cat) == (Emoji, gr::GC_Extend) { - continue; // rule GB10 - } - - if let Some(new_state) = match cat { - gr::GC_Extend => Some(FindExtend), // rule GB9 - gr::GC_SpacingMark if self.extended => Some(FindExtend), // rule GB9a - gr::GC_ZWJ => Some(Zwj), // rule GB9/GB11 - _ => None - } { - state = new_state; - continue; - } - - state = match state { - Start if '\r' == ch => { - let slen = self.string.len(); - let nidx = idx + 1; - if nidx != slen && self.string[nidx..].chars().next().unwrap() == '\n' { - idx = nidx; // rule GB3 - } - break; // rule GB4 - } - Start | Prepend => match cat { - gr::GC_Control => { // rule GB5 - take_curr = state == Start; - break; - } - gr::GC_L => HangulL, - gr::GC_LV | gr::GC_V => HangulLV, - gr::GC_LVT | gr::GC_T => HangulLVT, - gr::GC_Prepend if self.extended => Prepend, - gr::GC_Regional_Indicator => Regional, - gr::GC_E_Base | gr::GC_E_Base_GAZ => Emoji, - _ => FindExtend - }, - FindExtend => { // found non-extending when looking for extending - take_curr = false; - break; - }, - HangulL => match cat { // rule GB6: L x (L|V|LV|LVT) - gr::GC_L => continue, - gr::GC_LV | gr::GC_V => HangulLV, - gr::GC_LVT => HangulLVT, - _ => { - take_curr = false; - break; - } - }, - HangulLV => match cat { // rule GB7: (LV|V) x (V|T) - gr::GC_V => continue, - gr::GC_T => HangulLVT, - _ => { - take_curr = false; - break; - } - }, - HangulLVT => match cat { // rule GB8: (LVT|T) x T - gr::GC_T => continue, - _ => { - take_curr = false; - break; - } - }, - Regional => match cat { // rule GB12/GB13 - gr::GC_Regional_Indicator => FindExtend, - _ => { - take_curr = false; - break; - } - }, - Emoji => match cat { // rule GB10: (E_Base|EBG) Extend* x E_Modifier - gr::GC_E_Modifier => continue, - _ => { - take_curr = false; - break; - } - }, - Zwj => match cat { // rule GB11: ZWJ x (GAZ|EBG) - gr::GC_Glue_After_Zwj => continue, - gr::GC_E_Base_GAZ => Emoji, - _ => { - take_curr = false; - break; - } - }, - Unknown => unreachable!(), - } - } - - self.cat = if take_curr { - idx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); - None - } else { - Some(cat) - }; - - let retstr = &self.string[..idx]; - self.string = &self.string[idx..]; - Some(retstr) + let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap(); + Some(&self.string[start..next]) } } impl<'a> DoubleEndedIterator for Graphemes<'a> { #[inline] fn next_back(&mut self) -> Option<&'a str> { - use self::GraphemeState::*; - use tables::grapheme as gr; - if self.string.len() == 0 { + let end = self.cursor_back.cur_cursor(); + if end == self.cursor.cur_cursor() { return None; } - - let mut take_curr = true; - let mut idx = self.string.len(); - let mut previdx = idx; - let mut state = Start; - let mut cat = gr::GC_Any; - - // caching used by next() should be invalidated - self.cat = None; - - 'outer: for (curr, ch) in self.string.char_indices().rev() { - previdx = idx; - idx = curr; - - // cached category, if any - cat = match self.catb { - None => gr::grapheme_category(ch), - _ => self.catb.take().unwrap() - }; - - // a matching state machine that runs *backwards* across an input string - // note that this has some implications for the Hangul matching, since - // we now need to know what the rightward letter is: - // - // Right to left, we have: - // L x L - // V x (L|V|LV) - // T x (V|T|LV|LVT) - // HangulL means the letter to the right is L - // HangulLV means the letter to the right is V - // HangulLVT means the letter to the right is T - state = match state { - Start if '\n' == ch => { - if idx > 0 && '\r' == self.string[..idx].chars().next_back().unwrap() { - idx -= 1; // rule GB3 - } - break; // rule GB4 - }, - Start | FindExtend => match cat { - gr::GC_Extend => FindExtend, - gr::GC_SpacingMark if self.extended => FindExtend, - gr::GC_ZWJ => FindExtend, - gr::GC_E_Modifier => Emoji, - gr::GC_Glue_After_Zwj | gr::GC_E_Base_GAZ => Zwj, - gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL, - gr::GC_V => HangulLV, - gr::GC_T => HangulLVT, - gr::GC_Regional_Indicator => Regional, - gr::GC_Control => { - take_curr = Start == state; - break; - }, - _ => break - }, - HangulL => match cat { // char to right is an L - gr::GC_L => continue, // L x L is the only legal match - _ => { - take_curr = false; - break; - } - }, - HangulLV => match cat { // char to right is a V - gr::GC_V => continue, // V x V, right char is still V - gr::GC_L | gr::GC_LV => HangulL, // (L|V) x V, right char is now L - _ => { - take_curr = false; - break; - } - }, - HangulLVT => match cat { // char to right is a T - gr::GC_T => continue, // T x T, right char is still T - gr::GC_V => HangulLV, // V x T, right char is now V - gr::GC_LV | gr::GC_LVT => HangulL, // (LV|LVT) x T, right char is now L - _ => { - take_curr = false; - break; - } - }, - Prepend => { - // not used in reverse iteration - unreachable!() - }, - Regional => { // rule GB12/GB13 - // Need to scan backward to find if this is preceded by an odd or even number - // of Regional_Indicator characters. - let count = match self.regional_count_back { - Some(count) => count, - None => self.string[..previdx].chars().rev().take_while(|c| { - gr::grapheme_category(*c) == gr::GC_Regional_Indicator - }).count() - }; - // Cache the count to avoid re-scanning the same chars on the next iteration. - self.regional_count_back = count.checked_sub(1); - - if count % 2 == 0 { - take_curr = false; - break; - } - continue; - }, - Emoji => { // char to right is E_Modifier - // In order to decide whether to break before this E_Modifier char, we need to - // scan backward past any Extend chars to look for (E_Base|(ZWJ? EBG)). - let mut ebg_idx = None; - for (startidx, prev) in self.string[..previdx].char_indices().rev() { - match (ebg_idx, gr::grapheme_category(prev)) { - (None, gr::GC_Extend) => continue, - (None, gr::GC_E_Base) => { // rule GB10 - // Found an Emoji modifier sequence. Return the whole sequence. - idx = startidx; - break 'outer; - } - (None, gr::GC_E_Base_GAZ) => { // rule GB10 - // Keep scanning in case this is part of an ZWJ x EBJ pair. - ebg_idx = Some(startidx); - } - (Some(_), gr::GC_ZWJ) => { // rule GB11 - idx = startidx; - break 'outer; - } - _ => break - } - } - if let Some(ebg_idx) = ebg_idx { - // Found an EBG without a ZWJ before it. - idx = ebg_idx; - break; - } - // Not part of an Emoji modifier sequence. Break here. - take_curr = false; - break; - }, - Zwj => match cat { // char to right is (GAZ|EBG) - gr::GC_ZWJ => FindExtend, // rule GB11: ZWJ x (GAZ|EBG) - _ => { - take_curr = false; - break; - } - }, - Unknown => unreachable!(), - } - } - - self.catb = if take_curr { - None - } else { - idx = previdx; - Some(cat) - }; - - if self.extended && cat != gr::GC_Control { - // rule GB9b: include any preceding Prepend characters - for (i, c) in self.string[..idx].char_indices().rev() { - match gr::grapheme_category(c) { - gr::GC_Prepend => idx = i, - cat => { - self.catb = Some(cat); - break; - } - } - } - } - - let retstr = &self.string[idx..]; - self.string = &self.string[..idx]; - Some(retstr) + let prev = self.cursor_back.prev_boundary(self.string, 0).unwrap().unwrap(); + Some(&self.string[prev..end]) } } #[inline] pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> { + let len = s.len(); Graphemes { string: s, - extended: is_extended, - cat: None, - catb: None, - regional_count_back: None + cursor: GraphemeCursor::new(0, len, is_extended), + cursor_back: GraphemeCursor::new(len, len, is_extended), } } @@ -438,8 +135,8 @@ pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndice } // maybe unify with PairResult? -#[derive(PartialEq, Eq)] -enum GraphemeCursorState { +#[derive(PartialEq, Eq, Clone)] +enum GraphemeState { Unknown, NotBreak, Break, @@ -448,30 +145,49 @@ enum GraphemeCursorState { Emoji, } +/// Cursor-based segmenter for grapheme clusters. +#[derive(Clone)] pub struct GraphemeCursor { offset: usize, // current cursor position len: usize, // total length of the string is_extended: bool, - state: GraphemeCursorState, - cat: Option, // category of codepoint immediately preceding cursor - catb: Option, // category of codepoint immediately after cursor + state: GraphemeState, + cat_before: Option, // category of codepoint immediately preceding cursor + cat_after: Option, // category of codepoint immediately after cursor pre_context_offset: Option, ris_count: Option, + resuming: bool, // query was suspended } -#[derive(PartialEq, Eq)] +/// An error return indicating that not enough content was available in the +/// provided chunk to satisfy the query, and that more content must be provided. +#[derive(PartialEq, Eq, Debug)] pub enum GraphemeIncomplete { - PreContext(usize), // need pre-context for chunk ending at usize - PrevChunk, // requesting chunk previous to the one given + /// More pre-context is needed. The caller should call `provide_context` + /// with a chunk ending at the offset given, then retry the query. This + /// will only be returned if the `chunk_start` parameter is nonzero. + PreContext(usize), + + /// When requesting `prev_boundary`, the cursor is moving past the beginning + /// of the current chunk, so the chunk before that is requested. This will + /// only be returned if the `chunk_start` parameter is nonzero. + PrevChunk, + + /// When requesting `next_boundary`, the cursor is moving past the end of the + /// current chunk, so the chunk after that is requested. This will only be + /// returned if the chunk ends before the `len` parameter provided on + /// creation of the cursor. NextChunk, // requesting chunk following the one given - InvalidOffset, // error, chunk given is not inside cursor + + /// An error returned when the chunk given does not contain the cursor position. + InvalidOffset, } #[derive(PartialEq, Eq)] enum PairResult { NotBreak, // definitely not a break Break, // definitely a break - Extended, // a break if in extended mode + Extended, // a break if not in extended mode CheckCrlf, // a break unless it's a CR LF pair Regional, // a break if preceded by an even number of RIS Emoji, // a break if preceded by emoji base and extend @@ -502,42 +218,65 @@ fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult { (GC_E_Base_GAZ, GC_E_Modifier) => NotBreak, // GB10 (GC_Extend, GC_E_Modifier) => Emoji, // GB10 (GC_ZWJ, GC_Glue_After_Zwj) => NotBreak, // GB11 + (GC_ZWJ, GC_E_Base_GAZ) => NotBreak, // GB11 (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13 (_, _) => Break, // GB999 } } impl GraphemeCursor { + /// Create a new cursor. The string and initial offset are given at creation + /// time, but the contents of the string are not. The `is_extended` parameter + /// controls whether extended grapheme clusters are selected. + /// + /// The `offset` parameter must be on a codepoint boundary. pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor { - use tables::grapheme as gr; let state = if offset == 0 || offset == len { - GraphemeCursorState::Break + GraphemeState::Break } else { - GraphemeCursorState::Unknown + GraphemeState::Unknown }; GraphemeCursor { offset: offset, len: len, state: state, is_extended: is_extended, - cat: None, - catb: None, + cat_before: None, + cat_after: None, pre_context_offset: None, ris_count: None, + resuming: false, } } + // Not sure I'm gonna keep this, the advantage over new() seems thin. + + /// Set the cursor to a new location in the same string. pub fn set_cursor(&mut self, offset: usize) { if offset != self.offset { self.offset = offset; self.state = if offset == 0 || offset == self.len { - GraphemeCursorState::Break + GraphemeState::Break } else { - GraphemeCursorState::Unknown + GraphemeState::Unknown }; + // reset state derived from text around cursor + self.cat_before = None; + self.cat_after = None; + self.ris_count = None; } } + /// The current offset of the cursor. Equal to the last value provided to + /// `new()` or `set_cursor()`, or returned from `next_boundary()` or + /// `prev_boundary()`. + pub fn cur_cursor(&self) -> usize { + self.offset + } + + /// Provide additional pre-context when it is needed to decide a boundary. + /// The end of the chunk must coincide with the value given in the + /// `GraphemeIncomplete::PreContext` request. pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) { use tables::grapheme as gr; assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap()); @@ -550,21 +289,21 @@ impl GraphemeCursor { } } match self.state { - GraphemeCursorState::CheckCrlf => { + GraphemeState::CheckCrlf => { let is_break = chunk.as_bytes()[chunk.len() - 1] != b'\r'; self.decide(is_break); } - GraphemeCursorState::Regional => self.handle_regional(chunk, chunk_start), - GraphemeCursorState::Emoji => self.handle_emoji(chunk, chunk_start), + GraphemeState::Regional => self.handle_regional(chunk, chunk_start), + GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start), _ => panic!("invalid state") } } fn decide(&mut self, is_break: bool) { self.state = if is_break { - GraphemeCursorState::Break + GraphemeState::Break } else { - GraphemeCursorState::NotBreak + GraphemeState::NotBreak }; } @@ -574,9 +313,9 @@ impl GraphemeCursor { } fn is_boundary_result(&self) -> Result { - if self.state == GraphemeCursorState::Break { + if self.state == GraphemeState::Break { Ok(true) - } else if self.state == GraphemeCursorState::NotBreak { + } else if self.state == GraphemeState::NotBreak { Ok(false) } else if let Some(pre_context_offset) = self.pre_context_offset { Err(GraphemeIncomplete::PreContext(pre_context_offset)) @@ -591,14 +330,14 @@ impl GraphemeCursor { for ch in chunk.chars().rev() { if gr::grapheme_category(ch) != gr::GC_Regional_Indicator { self.ris_count = Some(ris_count); - self.decide((ris_count & 1) == 0); + self.decide((ris_count % 2) == 0); return; } ris_count += 1; } self.ris_count = Some(ris_count); if chunk_start == 0 { - self.decide((ris_count & 1) == 0); + self.decide((ris_count % 2) == 0); return; } self.pre_context_offset = Some(chunk_start); @@ -626,12 +365,26 @@ impl GraphemeCursor { self.pre_context_offset = Some(chunk_start); } + /// Determine whether the current cursor location is a grapheme cluster boundary. + /// Only a part of the string need be supplied. If `chunk_start` is nonzero or + /// the length of `chunk` is not equal to `len` on creation, then this method + /// may return `GraphemeIncomplete::PreContext`. The caller should then + /// call `provide_context` with the requested chunk, then retry calling this + /// method. + /// + /// For partial chunks, if the cursor is not at the beginning or end of the + /// string, the chunk should contain at least the codepoint following the cursor. + /// If the string is nonempty, the chunk must be nonempty. + /// + /// All calls should have consistent chunk contents (ie, if a chunk provides + /// content for a given slice, all further chunks covering that slice must have + /// the same content for it). pub fn is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result { use tables::grapheme as gr; - if self.state == GraphemeCursorState::Break { + if self.state == GraphemeState::Break { return Ok(true) } - if self.state == GraphemeCursorState::NotBreak { + if self.state == GraphemeState::NotBreak { return Ok(false) } if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() { @@ -641,46 +394,50 @@ impl GraphemeCursor { return Err(GraphemeIncomplete::PreContext(pre_context_offset)); } let offset_in_chunk = self.offset - chunk_start; - if self.catb.is_none() { + if self.cat_after.is_none() { let ch = chunk[offset_in_chunk..].chars().next().unwrap(); - self.catb = Some(gr::grapheme_category(ch)); + self.cat_after = Some(gr::grapheme_category(ch)); } if self.offset == chunk_start { - match self.catb.unwrap() { + match self.cat_after.unwrap() { gr::GC_Control => { if chunk.as_bytes()[offset_in_chunk] == b'\n' { - self.state = GraphemeCursorState::CheckCrlf; + self.state = GraphemeState::CheckCrlf; } } - gr::GC_Regional_Indicator => self.state = GraphemeCursorState::Regional, - gr::GC_E_Modifier => self.state = GraphemeCursorState::Emoji, + gr::GC_Regional_Indicator => self.state = GraphemeState::Regional, + gr::GC_E_Modifier => self.state = GraphemeState::Emoji, _ => () } self.pre_context_offset = Some(chunk_start); return Err(GraphemeIncomplete::PreContext(chunk_start)); } - if self.cat.is_none() { + if self.cat_before.is_none() { let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap(); - self.cat = Some(gr::grapheme_category(ch)); + self.cat_before = Some(gr::grapheme_category(ch)); } - match check_pair(self.cat.unwrap(), self.catb.unwrap()) { + match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) { PairResult::NotBreak => return self.decision(false), PairResult::Break => return self.decision(true), PairResult::Extended => { let is_extended = self.is_extended; - return self.decision(is_extended); + return self.decision(!is_extended); } PairResult::CheckCrlf => { if chunk.as_bytes()[offset_in_chunk] != b'\n' { return self.decision(true); } + // TODO: I think we don't have to test this if self.offset > chunk_start { return self.decision(chunk.as_bytes()[offset_in_chunk - 1] != b'\r'); } - self.state = GraphemeCursorState::CheckCrlf; + self.state = GraphemeState::CheckCrlf; return Err(GraphemeIncomplete::PreContext(chunk_start)); } PairResult::Regional => { + if let Some(ris_count) = self.ris_count { + return self.decision((ris_count % 2) == 0); + } self.handle_regional(&chunk[..offset_in_chunk], chunk_start); self.is_boundary_result() } @@ -691,54 +448,104 @@ impl GraphemeCursor { } } + /// Find the next boundary after the current cursor position. Only a part of + /// the string need be supplied. If the chunk is incomplete, then this + /// method might return `GraphemeIncomplete::PreContext` or + /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should + /// call `provide_context` with the requested chunk, then retry. In the + /// latter case, the caller should provide the chunk following the one + /// given, then retry. + /// + /// See `is_boundary` for expectations on the provided chunk. pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result, GraphemeIncomplete> { + use tables::grapheme as gr; if self.offset == self.len { return Ok(None); } + let mut iter = chunk[self.offset - chunk_start..].chars(); + let mut ch = iter.next().unwrap(); loop { - let ch = chunk[self.offset - chunk_start..].chars().next().unwrap(); - self.offset += ch.len_utf8(); - self.cat = self.catb.take(); - self.state = GraphemeCursorState::Unknown; - if let (Some(ris_count), Some(cat)) = (self.ris_count, self.cat) { - if cat == GraphemeCat::GC_Regional_Indicator { - self.ris_count = Some(ris_count + 1); + if self.resuming { + if self.cat_after.is_none() { + self.cat_after = Some(gr::grapheme_category(ch)); + } + } else { + self.offset += ch.len_utf8(); + self.state = GraphemeState::Unknown; + self.cat_before = self.cat_after.take(); + if self.cat_before.is_none() { + self.cat_before = Some(gr::grapheme_category(ch)); + } + if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator { + self.ris_count = self.ris_count.map(|c| c + 1); } else { self.ris_count = Some(0); } + if let Some(next_ch) = iter.next() { + ch = next_ch; + self.cat_after = Some(gr::grapheme_category(ch)); + } else if self.offset == self.len { + self.decide(true); + } else { + self.resuming = true; + return Err(GraphemeIncomplete::NextChunk); + } } - if self.offset == self.len { - self.decide(true); - } else if self.offset >= chunk_start + chunk.len() { - return Err(GraphemeIncomplete::NextChunk); - } + self.resuming = true; if self.is_boundary(chunk, chunk_start)? { + self.resuming = false; return Ok(Some(self.offset)); } + self.resuming = false; } } + /// Find the previous boundary after the current cursor position. Only a part + /// of the string need be supplied. If the chunk is incomplete, then this + /// method might return `GraphemeIncomplete::PreContext` or + /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should + /// call `provide_context` with the requested chunk, then retry. In the + /// latter case, the caller should provide the chunk preceding the one + /// given, then retry. + /// + /// See `is_boundary` for expectations on the provided chunk. pub fn prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result, GraphemeIncomplete> { + use tables::grapheme as gr; if self.offset == 0 { return Ok(None); } + let mut iter = chunk[..self.offset - chunk_start].chars().rev(); + let mut ch = iter.next().unwrap(); loop { if self.offset == chunk_start { + self.resuming = true; return Err(GraphemeIncomplete::PrevChunk); } - let ch = chunk[..self.offset - chunk_start].chars().rev().next().unwrap(); - self.offset -= ch.len_utf8(); - self.catb = self.cat.take(); - self.state = GraphemeCursorState::Unknown; - if let Some(ris_count) = self.ris_count { - self.ris_count = if ris_count > 0 { Some(ris_count - 1) } else { None }; - } - if self.offset == 0 { - self.decide(true); + if self.resuming { + self.cat_before = Some(gr::grapheme_category(ch)); + } else { + self.offset -= ch.len_utf8(); + self.cat_after = self.cat_before.take(); + self.state = GraphemeState::Unknown; + if let Some(ris_count) = self.ris_count { + self.ris_count = if ris_count > 0 { Some(ris_count - 1) } else { None }; + } + if let Some(prev_ch) = iter.next() { + ch = prev_ch; + self.cat_before = Some(gr::grapheme_category(ch)); + } else if self.offset == 0 { + self.decide(true); + } else { + self.resuming = true; + return Err(GraphemeIncomplete::PrevChunk); + } } + self.resuming = true; if self.is_boundary(chunk, chunk_start)? { + self.resuming = false; return Ok(Some(self.offset)); } + self.resuming = false; } } } diff --git a/src/lib.rs b/src/lib.rs index 96e6e3e..6f903c0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -64,6 +64,7 @@ extern crate std; extern crate quickcheck; pub use grapheme::{Graphemes, GraphemeIndices}; +pub use grapheme::{GraphemeCursor, GraphemeIncomplete}; pub use tables::UNICODE_VERSION; pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords}; From 4a76978fd1c69db4d805110df4dc049a1436f6cd Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Mon, 6 Mar 2017 09:15:45 -0800 Subject: [PATCH 4/7] Additional test case This adds a test case for #19 (which was a mismatch between forward and reverse iterators in the original codebase). --- src/test.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/test.rs b/src/test.rs index 3c43574..54493fe 100644 --- a/src/test.rs +++ b/src/test.rs @@ -34,6 +34,10 @@ fn test_graphemes() { // family emoji (more than two emoji joined by ZWJ) ("\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}", &["\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}"]), + // cartwheel emoji followed by two fitzpatrick skin tone modifiers + // (test case from issue #19) + ("\u{1F938}\u{1F3FE}\u{1F3FE}", + &["\u{1F938}\u{1F3FE}", "\u{1F3FE}"]), ]; for &(s, g) in TEST_SAME.iter().chain(EXTRA_SAME) { From 0083ef50b3c9662691d198132554e57899cbc674 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Wed, 15 Mar 2017 13:15:04 -0700 Subject: [PATCH 5/7] Add comments and doc tests Some of the doc tests also encouraged me to tweak the implementation. --- src/grapheme.rs | 153 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 141 insertions(+), 12 deletions(-) diff --git a/src/grapheme.rs b/src/grapheme.rs index 25f463a..b83a09c 100644 --- a/src/grapheme.rs +++ b/src/grapheme.rs @@ -135,28 +135,51 @@ pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndice } // maybe unify with PairResult? +// An enum describing information about a potential boundary. #[derive(PartialEq, Eq, Clone)] enum GraphemeState { + // No information is known. Unknown, + // It is known to not be a boundary. NotBreak, + // It is known to be a boundary. Break, + // The codepoint after is LF, so a boundary iff the codepoint before is not CR. (GB3) CheckCrlf, + // The codepoint after is a Regional Indicator Symbol, so a boundary iff + // it is preceded by an even number of RIS codepoints. (GB12, GB13) Regional, + // The codepoint after is in the E_Modifier category, so whether it's a boundary + // depends on pre-context according to GB10. Emoji, } /// Cursor-based segmenter for grapheme clusters. #[derive(Clone)] pub struct GraphemeCursor { - offset: usize, // current cursor position - len: usize, // total length of the string + // Current cursor position. + offset: usize, + // Total length of the string. + len: usize, + // A config flag indicating whether this cursor computes legacy or extended + // grapheme cluster boundaries (enables GB9a and GB9b if set). is_extended: bool, + // Information about the potential boundary at `offset` state: GraphemeState, - cat_before: Option, // category of codepoint immediately preceding cursor - cat_after: Option, // category of codepoint immediately after cursor + // Category of codepoint immediately preceding cursor, if known. + cat_before: Option, + // Category of codepoint immediately after cursor, if known. + cat_after: Option, + // If set, at least one more codepoint immediately preceding this offset + // is needed to resolve whether there's a boundary at `offset`. pre_context_offset: Option, + // The number of RIS codepoints preceding `offset`. If `pre_context_offset` + // is set, then counts the number of RIS between that and `offset`, otherwise + // is an accurate count relative to the string. ris_count: Option, - resuming: bool, // query was suspended + // Set if a call to `prev_boundary` or `next_boundary` was suspended due + // to needing more input. + resuming: bool, } /// An error return indicating that not enough content was available in the @@ -183,14 +206,15 @@ pub enum GraphemeIncomplete { InvalidOffset, } +// An enum describing the result from lookup of a pair of categories. #[derive(PartialEq, Eq)] enum PairResult { NotBreak, // definitely not a break Break, // definitely a break - Extended, // a break if not in extended mode + Extended, // a break iff not in extended mode CheckCrlf, // a break unless it's a CR LF pair Regional, // a break if preceded by an even number of RIS - Emoji, // a break if preceded by emoji base and extend + Emoji, // a break if preceded by emoji base and (Extend)* } fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult { @@ -213,7 +237,7 @@ fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult { (_, GC_Extend) => NotBreak, // GB9 (_, GC_ZWJ) => NotBreak, // GB9 (_, GC_SpacingMark) => Extended, // GB9a - (GC_Prepend, _) => Extended, // GB9a + (GC_Prepend, _) => Extended, // GB9b (GC_E_Base, GC_E_Modifier) => NotBreak, // GB10 (GC_E_Base_GAZ, GC_E_Modifier) => NotBreak, // GB10 (GC_Extend, GC_E_Modifier) => Emoji, // GB10 @@ -230,6 +254,15 @@ impl GraphemeCursor { /// controls whether extended grapheme clusters are selected. /// /// The `offset` parameter must be on a codepoint boundary. + /// + /// ```rust + /// # use unicode_segmentation::GraphemeCursor; + /// let s = "हिन्दी"; + /// let mut legacy = GraphemeCursor::new(0, s.len(), false); + /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len()))); + /// let mut extended = GraphemeCursor::new(0, s.len(), true); + /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len()))); + /// ``` pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor { let state = if offset == 0 || offset == len { GraphemeState::Break @@ -252,6 +285,15 @@ impl GraphemeCursor { // Not sure I'm gonna keep this, the advantage over new() seems thin. /// Set the cursor to a new location in the same string. + /// + /// ```rust + /// # use unicode_segmentation::GraphemeCursor; + /// let s = "abcd"; + /// let mut cursor = GraphemeCursor::new(0, s.len(), false); + /// assert_eq!(cursor.cur_cursor(), 0); + /// cursor.set_cursor(2); + /// assert_eq!(cursor.cur_cursor(), 2); + /// ``` pub fn set_cursor(&mut self, offset: usize) { if offset != self.offset { self.offset = offset; @@ -270,6 +312,16 @@ impl GraphemeCursor { /// The current offset of the cursor. Equal to the last value provided to /// `new()` or `set_cursor()`, or returned from `next_boundary()` or /// `prev_boundary()`. + /// + /// ```rust + /// # use unicode_segmentation::GraphemeCursor; + /// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes. + /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; + /// let mut cursor = GraphemeCursor::new(4, flags.len(), false); + /// assert_eq!(cursor.cur_cursor(), 4); + /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8))); + /// assert_eq!(cursor.cur_cursor(), 8); + /// ``` pub fn cur_cursor(&self) -> usize { self.offset } @@ -277,6 +329,22 @@ impl GraphemeCursor { /// Provide additional pre-context when it is needed to decide a boundary. /// The end of the chunk must coincide with the value given in the /// `GraphemeIncomplete::PreContext` request. + /// + /// ```rust + /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete}; + /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; + /// let mut cursor = GraphemeCursor::new(8, flags.len(), false); + /// // Note enough pre-context to decide if there's a boundary between the two flags. + /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8))); + /// // Provide one more Regional Indicator Symbol of pre-context + /// cursor.provide_context(&flags[4..8], 4); + /// // Still not enough context to decide. + /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4))); + /// // Provide additional requested context. + /// cursor.provide_context(&flags[0..4], 0); + /// // That's enough to decide (it always is when context goes to the start of the string) + /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true)); + /// ``` pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) { use tables::grapheme as gr; assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap()); @@ -379,6 +447,15 @@ impl GraphemeCursor { /// All calls should have consistent chunk contents (ie, if a chunk provides /// content for a given slice, all further chunks covering that slice must have /// the same content for it). + /// + /// ```rust + /// # use unicode_segmentation::GraphemeCursor; + /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; + /// let mut cursor = GraphemeCursor::new(8, flags.len(), false); + /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true)); + /// cursor.set_cursor(12); + /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false)); + /// ``` pub fn is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result { use tables::grapheme as gr; if self.state == GraphemeState::Break { @@ -388,7 +465,9 @@ impl GraphemeCursor { return Ok(false) } if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() { - return Err(GraphemeIncomplete::InvalidOffset) + if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() { + return Err(GraphemeIncomplete::InvalidOffset) + } } if let Some(pre_context_offset) = self.pre_context_offset { return Err(GraphemeIncomplete::PreContext(pre_context_offset)); @@ -399,6 +478,7 @@ impl GraphemeCursor { self.cat_after = Some(gr::grapheme_category(ch)); } if self.offset == chunk_start { + let mut need_pre_context = true; match self.cat_after.unwrap() { gr::GC_Control => { if chunk.as_bytes()[offset_in_chunk] == b'\n' { @@ -407,10 +487,12 @@ impl GraphemeCursor { } gr::GC_Regional_Indicator => self.state = GraphemeState::Regional, gr::GC_E_Modifier => self.state = GraphemeState::Emoji, - _ => () + _ => need_pre_context = self.cat_before.is_none(), + } + if need_pre_context { + self.pre_context_offset = Some(chunk_start); + return Err(GraphemeIncomplete::PreContext(chunk_start)); } - self.pre_context_offset = Some(chunk_start); - return Err(GraphemeIncomplete::PreContext(chunk_start)); } if self.cat_before.is_none() { let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap(); @@ -457,6 +539,29 @@ impl GraphemeCursor { /// given, then retry. /// /// See `is_boundary` for expectations on the provided chunk. + /// + /// ```rust + /// # use unicode_segmentation::GraphemeCursor; + /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; + /// let mut cursor = GraphemeCursor::new(4, flags.len(), false); + /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8))); + /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16))); + /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None)); + /// ``` + /// + /// And an example that uses partial strings: + /// + /// ```rust + /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete}; + /// let s = "abcd"; + /// let mut cursor = GraphemeCursor::new(0, s.len(), false); + /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1))); + /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk)); + /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2))); + /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3))); + /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4))); + /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None)); + /// ``` pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result, GraphemeIncomplete> { use tables::grapheme as gr; if self.offset == self.len { @@ -509,6 +614,30 @@ impl GraphemeCursor { /// given, then retry. /// /// See `is_boundary` for expectations on the provided chunk. + /// + /// ```rust + /// # use unicode_segmentation::GraphemeCursor; + /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; + /// let mut cursor = GraphemeCursor::new(12, flags.len(), false); + /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8))); + /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0))); + /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None)); + /// ``` + /// + /// And an example that uses partial strings (note the exact return is not + /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily): + /// + /// ```rust + /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete}; + /// let s = "abcd"; + /// let mut cursor = GraphemeCursor::new(4, s.len(), false); + /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3))); + /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk)); + /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2))); + /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1))); + /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0))); + /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None)); + /// ``` pub fn prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result, GraphemeIncomplete> { use tables::grapheme as gr; if self.offset == 0 { From 92767fd0fe8dbd41f7b2c0f942ccd6a187b219b2 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Wed, 15 Mar 2017 13:25:30 -0700 Subject: [PATCH 6/7] Get rid of special case for CRLF The existing code treated CR and LF as special cases of the Control grapheme category, for reasons that weren't very good. This patch gets rid of that and just handles GB3 in the pair lookup. That should improve performance in the rope case, as it will cut down on the amount of pre-context requested when a chunk begins with LF. --- scripts/unicode.py | 12 +--- src/grapheme.rs | 29 ++-------- src/tables.rs | 136 +++++++++++++++++++++++---------------------- 3 files changed, 77 insertions(+), 100 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index 53dfe74..189832b 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -330,21 +330,13 @@ def emit_break_module(f, break_table, break_cats, name): grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt", []) # Control - # Note 1: + # Note: # This category also includes Cs (surrogate codepoints), but Rust's `char`s are # Unicode Scalar Values only, and surrogates are thus invalid `char`s. # Thus, we have to remove Cs from the Control category - # Note 2: - # 0x0a and 0x0d (CR and LF) are not in the Control category for Graphemes. - # However, the Graphemes iterator treats these as a special case, so they - # should be included in grapheme_cats["Control"] for our implementation. grapheme_cats["Control"] = group_cat(list( - (set(ungroup_cat(grapheme_cats["Control"])) - | set(ungroup_cat(grapheme_cats["CR"])) - | set(ungroup_cat(grapheme_cats["LF"]))) + set(ungroup_cat(grapheme_cats["Control"])) - set(ungroup_cat([surrogate_codepoints])))) - del(grapheme_cats["CR"]) - del(grapheme_cats["LF"]) grapheme_table = [] for cat in grapheme_cats: diff --git a/src/grapheme.rs b/src/grapheme.rs index b83a09c..d523e27 100644 --- a/src/grapheme.rs +++ b/src/grapheme.rs @@ -144,8 +144,6 @@ enum GraphemeState { NotBreak, // It is known to be a boundary. Break, - // The codepoint after is LF, so a boundary iff the codepoint before is not CR. (GB3) - CheckCrlf, // The codepoint after is a Regional Indicator Symbol, so a boundary iff // it is preceded by an even number of RIS codepoints. (GB12, GB13) Regional, @@ -212,7 +210,6 @@ enum PairResult { NotBreak, // definitely not a break Break, // definitely a break Extended, // a break iff not in extended mode - CheckCrlf, // a break unless it's a CR LF pair Regional, // a break if preceded by an even number of RIS Emoji, // a break if preceded by emoji base and (Extend)* } @@ -221,9 +218,13 @@ fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult { use tables::grapheme::GraphemeCat::*; use self::PairResult::*; match (before, after) { - (GC_Control, GC_Control) => CheckCrlf, // GB3 + (GC_CR, GC_LF) => NotBreak, // GB3 (GC_Control, _) => Break, // GB4 + (GC_CR, _) => Break, // GB4 + (GC_LF, _) => Break, // GB4 (_, GC_Control) => Break, // GB5 + (_, GC_CR) => Break, // GB5 + (_, GC_LF) => Break, // GB5 (GC_L, GC_L) => NotBreak, // GB6 (GC_L, GC_V) => NotBreak, // GB6 (GC_L, GC_LV) => NotBreak, // GB6 @@ -357,10 +358,6 @@ impl GraphemeCursor { } } match self.state { - GraphemeState::CheckCrlf => { - let is_break = chunk.as_bytes()[chunk.len() - 1] != b'\r'; - self.decide(is_break); - } GraphemeState::Regional => self.handle_regional(chunk, chunk_start), GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start), _ => panic!("invalid state") @@ -480,11 +477,6 @@ impl GraphemeCursor { if self.offset == chunk_start { let mut need_pre_context = true; match self.cat_after.unwrap() { - gr::GC_Control => { - if chunk.as_bytes()[offset_in_chunk] == b'\n' { - self.state = GraphemeState::CheckCrlf; - } - } gr::GC_Regional_Indicator => self.state = GraphemeState::Regional, gr::GC_E_Modifier => self.state = GraphemeState::Emoji, _ => need_pre_context = self.cat_before.is_none(), @@ -505,17 +497,6 @@ impl GraphemeCursor { let is_extended = self.is_extended; return self.decision(!is_extended); } - PairResult::CheckCrlf => { - if chunk.as_bytes()[offset_in_chunk] != b'\n' { - return self.decision(true); - } - // TODO: I think we don't have to test this - if self.offset > chunk_start { - return self.decision(chunk.as_bytes()[offset_in_chunk - 1] != b'\r'); - } - self.state = GraphemeState::CheckCrlf; - return Err(GraphemeIncomplete::PreContext(chunk_start)); - } PairResult::Regional => { if let Some(ris_count) = self.ris_count { return self.decision((ris_count % 2) == 0); diff --git a/src/tables.rs b/src/tables.rs index b491f62..625a588 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -296,6 +296,7 @@ pub mod grapheme { #[derive(Clone, Copy, PartialEq, Eq)] pub enum GraphemeCat { GC_Any, + GC_CR, GC_Control, GC_E_Base, GC_E_Base_GAZ, @@ -303,6 +304,7 @@ pub mod grapheme { GC_Extend, GC_Glue_After_Zwj, GC_L, + GC_LF, GC_LV, GC_LVT, GC_Prepend, @@ -333,71 +335,73 @@ pub mod grapheme { } const grapheme_cat_table: &'static [(char, char, GraphemeCat)] = &[ - ('\u{0}', '\u{1f}', GC_Control), ('\u{7f}', '\u{9f}', GC_Control), ('\u{ad}', '\u{ad}', - GC_Control), ('\u{300}', '\u{36f}', GC_Extend), ('\u{483}', '\u{489}', GC_Extend), - ('\u{591}', '\u{5bd}', GC_Extend), ('\u{5bf}', '\u{5bf}', GC_Extend), ('\u{5c1}', '\u{5c2}', - GC_Extend), ('\u{5c4}', '\u{5c5}', GC_Extend), ('\u{5c7}', '\u{5c7}', GC_Extend), - ('\u{600}', '\u{605}', GC_Prepend), ('\u{610}', '\u{61a}', GC_Extend), ('\u{61c}', - '\u{61c}', GC_Control), ('\u{64b}', '\u{65f}', GC_Extend), ('\u{670}', '\u{670}', - GC_Extend), ('\u{6d6}', '\u{6dc}', GC_Extend), ('\u{6dd}', '\u{6dd}', GC_Prepend), - ('\u{6df}', '\u{6e4}', GC_Extend), ('\u{6e7}', '\u{6e8}', GC_Extend), ('\u{6ea}', '\u{6ed}', - GC_Extend), ('\u{70f}', '\u{70f}', GC_Prepend), ('\u{711}', '\u{711}', GC_Extend), - ('\u{730}', '\u{74a}', GC_Extend), ('\u{7a6}', '\u{7b0}', GC_Extend), ('\u{7eb}', '\u{7f3}', - GC_Extend), ('\u{816}', '\u{819}', GC_Extend), ('\u{81b}', '\u{823}', GC_Extend), - ('\u{825}', '\u{827}', GC_Extend), ('\u{829}', '\u{82d}', GC_Extend), ('\u{859}', '\u{85b}', - GC_Extend), ('\u{8d4}', '\u{8e1}', GC_Extend), ('\u{8e2}', '\u{8e2}', GC_Prepend), - ('\u{8e3}', '\u{902}', GC_Extend), ('\u{903}', '\u{903}', GC_SpacingMark), ('\u{93a}', - '\u{93a}', GC_Extend), ('\u{93b}', '\u{93b}', GC_SpacingMark), ('\u{93c}', '\u{93c}', - GC_Extend), ('\u{93e}', '\u{940}', GC_SpacingMark), ('\u{941}', '\u{948}', GC_Extend), - ('\u{949}', '\u{94c}', GC_SpacingMark), ('\u{94d}', '\u{94d}', GC_Extend), ('\u{94e}', - '\u{94f}', GC_SpacingMark), ('\u{951}', '\u{957}', GC_Extend), ('\u{962}', '\u{963}', - GC_Extend), ('\u{981}', '\u{981}', GC_Extend), ('\u{982}', '\u{983}', GC_SpacingMark), - ('\u{9bc}', '\u{9bc}', GC_Extend), ('\u{9be}', '\u{9be}', GC_Extend), ('\u{9bf}', '\u{9c0}', - GC_SpacingMark), ('\u{9c1}', '\u{9c4}', GC_Extend), ('\u{9c7}', '\u{9c8}', GC_SpacingMark), - ('\u{9cb}', '\u{9cc}', GC_SpacingMark), ('\u{9cd}', '\u{9cd}', GC_Extend), ('\u{9d7}', - '\u{9d7}', GC_Extend), ('\u{9e2}', '\u{9e3}', GC_Extend), ('\u{a01}', '\u{a02}', GC_Extend), - ('\u{a03}', '\u{a03}', GC_SpacingMark), ('\u{a3c}', '\u{a3c}', GC_Extend), ('\u{a3e}', - '\u{a40}', GC_SpacingMark), ('\u{a41}', '\u{a42}', GC_Extend), ('\u{a47}', '\u{a48}', - GC_Extend), ('\u{a4b}', '\u{a4d}', GC_Extend), ('\u{a51}', '\u{a51}', GC_Extend), - ('\u{a70}', '\u{a71}', GC_Extend), ('\u{a75}', '\u{a75}', GC_Extend), ('\u{a81}', '\u{a82}', - GC_Extend), ('\u{a83}', '\u{a83}', GC_SpacingMark), ('\u{abc}', '\u{abc}', GC_Extend), - ('\u{abe}', '\u{ac0}', GC_SpacingMark), ('\u{ac1}', '\u{ac5}', GC_Extend), ('\u{ac7}', - '\u{ac8}', GC_Extend), ('\u{ac9}', '\u{ac9}', GC_SpacingMark), ('\u{acb}', '\u{acc}', - GC_SpacingMark), ('\u{acd}', '\u{acd}', GC_Extend), ('\u{ae2}', '\u{ae3}', GC_Extend), - ('\u{b01}', '\u{b01}', GC_Extend), ('\u{b02}', '\u{b03}', GC_SpacingMark), ('\u{b3c}', - '\u{b3c}', GC_Extend), ('\u{b3e}', '\u{b3f}', GC_Extend), ('\u{b40}', '\u{b40}', - GC_SpacingMark), ('\u{b41}', '\u{b44}', GC_Extend), ('\u{b47}', '\u{b48}', GC_SpacingMark), - ('\u{b4b}', '\u{b4c}', GC_SpacingMark), ('\u{b4d}', '\u{b4d}', GC_Extend), ('\u{b56}', - '\u{b57}', GC_Extend), ('\u{b62}', '\u{b63}', GC_Extend), ('\u{b82}', '\u{b82}', GC_Extend), - ('\u{bbe}', '\u{bbe}', GC_Extend), ('\u{bbf}', '\u{bbf}', GC_SpacingMark), ('\u{bc0}', - '\u{bc0}', GC_Extend), ('\u{bc1}', '\u{bc2}', GC_SpacingMark), ('\u{bc6}', '\u{bc8}', - GC_SpacingMark), ('\u{bca}', '\u{bcc}', GC_SpacingMark), ('\u{bcd}', '\u{bcd}', GC_Extend), - ('\u{bd7}', '\u{bd7}', GC_Extend), ('\u{c00}', '\u{c00}', GC_Extend), ('\u{c01}', '\u{c03}', - GC_SpacingMark), ('\u{c3e}', '\u{c40}', GC_Extend), ('\u{c41}', '\u{c44}', GC_SpacingMark), - ('\u{c46}', '\u{c48}', GC_Extend), ('\u{c4a}', '\u{c4d}', GC_Extend), ('\u{c55}', '\u{c56}', - GC_Extend), ('\u{c62}', '\u{c63}', GC_Extend), ('\u{c81}', '\u{c81}', GC_Extend), - ('\u{c82}', '\u{c83}', GC_SpacingMark), ('\u{cbc}', '\u{cbc}', GC_Extend), ('\u{cbe}', - '\u{cbe}', GC_SpacingMark), ('\u{cbf}', '\u{cbf}', GC_Extend), ('\u{cc0}', '\u{cc1}', - GC_SpacingMark), ('\u{cc2}', '\u{cc2}', GC_Extend), ('\u{cc3}', '\u{cc4}', GC_SpacingMark), - ('\u{cc6}', '\u{cc6}', GC_Extend), ('\u{cc7}', '\u{cc8}', GC_SpacingMark), ('\u{cca}', - '\u{ccb}', GC_SpacingMark), ('\u{ccc}', '\u{ccd}', GC_Extend), ('\u{cd5}', '\u{cd6}', - GC_Extend), ('\u{ce2}', '\u{ce3}', GC_Extend), ('\u{d01}', '\u{d01}', GC_Extend), - ('\u{d02}', '\u{d03}', GC_SpacingMark), ('\u{d3e}', '\u{d3e}', GC_Extend), ('\u{d3f}', - '\u{d40}', GC_SpacingMark), ('\u{d41}', '\u{d44}', GC_Extend), ('\u{d46}', '\u{d48}', - GC_SpacingMark), ('\u{d4a}', '\u{d4c}', GC_SpacingMark), ('\u{d4d}', '\u{d4d}', GC_Extend), - ('\u{d4e}', '\u{d4e}', GC_Prepend), ('\u{d57}', '\u{d57}', GC_Extend), ('\u{d62}', - '\u{d63}', GC_Extend), ('\u{d82}', '\u{d83}', GC_SpacingMark), ('\u{dca}', '\u{dca}', - GC_Extend), ('\u{dcf}', '\u{dcf}', GC_Extend), ('\u{dd0}', '\u{dd1}', GC_SpacingMark), - ('\u{dd2}', '\u{dd4}', GC_Extend), ('\u{dd6}', '\u{dd6}', GC_Extend), ('\u{dd8}', '\u{dde}', - GC_SpacingMark), ('\u{ddf}', '\u{ddf}', GC_Extend), ('\u{df2}', '\u{df3}', GC_SpacingMark), - ('\u{e31}', '\u{e31}', GC_Extend), ('\u{e33}', '\u{e33}', GC_SpacingMark), ('\u{e34}', - '\u{e3a}', GC_Extend), ('\u{e47}', '\u{e4e}', GC_Extend), ('\u{eb1}', '\u{eb1}', GC_Extend), - ('\u{eb3}', '\u{eb3}', GC_SpacingMark), ('\u{eb4}', '\u{eb9}', GC_Extend), ('\u{ebb}', - '\u{ebc}', GC_Extend), ('\u{ec8}', '\u{ecd}', GC_Extend), ('\u{f18}', '\u{f19}', GC_Extend), - ('\u{f35}', '\u{f35}', GC_Extend), ('\u{f37}', '\u{f37}', GC_Extend), ('\u{f39}', '\u{f39}', - GC_Extend), ('\u{f3e}', '\u{f3f}', GC_SpacingMark), ('\u{f71}', '\u{f7e}', GC_Extend), - ('\u{f7f}', '\u{f7f}', GC_SpacingMark), ('\u{f80}', '\u{f84}', GC_Extend), ('\u{f86}', - '\u{f87}', GC_Extend), ('\u{f8d}', '\u{f97}', GC_Extend), ('\u{f99}', '\u{fbc}', GC_Extend), + ('\u{0}', '\u{9}', GC_Control), ('\u{a}', '\u{a}', GC_LF), ('\u{b}', '\u{c}', GC_Control), + ('\u{d}', '\u{d}', GC_CR), ('\u{e}', '\u{1f}', GC_Control), ('\u{7f}', '\u{9f}', + GC_Control), ('\u{ad}', '\u{ad}', GC_Control), ('\u{300}', '\u{36f}', GC_Extend), + ('\u{483}', '\u{489}', GC_Extend), ('\u{591}', '\u{5bd}', GC_Extend), ('\u{5bf}', '\u{5bf}', + GC_Extend), ('\u{5c1}', '\u{5c2}', GC_Extend), ('\u{5c4}', '\u{5c5}', GC_Extend), + ('\u{5c7}', '\u{5c7}', GC_Extend), ('\u{600}', '\u{605}', GC_Prepend), ('\u{610}', + '\u{61a}', GC_Extend), ('\u{61c}', '\u{61c}', GC_Control), ('\u{64b}', '\u{65f}', + GC_Extend), ('\u{670}', '\u{670}', GC_Extend), ('\u{6d6}', '\u{6dc}', GC_Extend), + ('\u{6dd}', '\u{6dd}', GC_Prepend), ('\u{6df}', '\u{6e4}', GC_Extend), ('\u{6e7}', + '\u{6e8}', GC_Extend), ('\u{6ea}', '\u{6ed}', GC_Extend), ('\u{70f}', '\u{70f}', + GC_Prepend), ('\u{711}', '\u{711}', GC_Extend), ('\u{730}', '\u{74a}', GC_Extend), + ('\u{7a6}', '\u{7b0}', GC_Extend), ('\u{7eb}', '\u{7f3}', GC_Extend), ('\u{816}', '\u{819}', + GC_Extend), ('\u{81b}', '\u{823}', GC_Extend), ('\u{825}', '\u{827}', GC_Extend), + ('\u{829}', '\u{82d}', GC_Extend), ('\u{859}', '\u{85b}', GC_Extend), ('\u{8d4}', '\u{8e1}', + GC_Extend), ('\u{8e2}', '\u{8e2}', GC_Prepend), ('\u{8e3}', '\u{902}', GC_Extend), + ('\u{903}', '\u{903}', GC_SpacingMark), ('\u{93a}', '\u{93a}', GC_Extend), ('\u{93b}', + '\u{93b}', GC_SpacingMark), ('\u{93c}', '\u{93c}', GC_Extend), ('\u{93e}', '\u{940}', + GC_SpacingMark), ('\u{941}', '\u{948}', GC_Extend), ('\u{949}', '\u{94c}', GC_SpacingMark), + ('\u{94d}', '\u{94d}', GC_Extend), ('\u{94e}', '\u{94f}', GC_SpacingMark), ('\u{951}', + '\u{957}', GC_Extend), ('\u{962}', '\u{963}', GC_Extend), ('\u{981}', '\u{981}', GC_Extend), + ('\u{982}', '\u{983}', GC_SpacingMark), ('\u{9bc}', '\u{9bc}', GC_Extend), ('\u{9be}', + '\u{9be}', GC_Extend), ('\u{9bf}', '\u{9c0}', GC_SpacingMark), ('\u{9c1}', '\u{9c4}', + GC_Extend), ('\u{9c7}', '\u{9c8}', GC_SpacingMark), ('\u{9cb}', '\u{9cc}', GC_SpacingMark), + ('\u{9cd}', '\u{9cd}', GC_Extend), ('\u{9d7}', '\u{9d7}', GC_Extend), ('\u{9e2}', '\u{9e3}', + GC_Extend), ('\u{a01}', '\u{a02}', GC_Extend), ('\u{a03}', '\u{a03}', GC_SpacingMark), + ('\u{a3c}', '\u{a3c}', GC_Extend), ('\u{a3e}', '\u{a40}', GC_SpacingMark), ('\u{a41}', + '\u{a42}', GC_Extend), ('\u{a47}', '\u{a48}', GC_Extend), ('\u{a4b}', '\u{a4d}', GC_Extend), + ('\u{a51}', '\u{a51}', GC_Extend), ('\u{a70}', '\u{a71}', GC_Extend), ('\u{a75}', '\u{a75}', + GC_Extend), ('\u{a81}', '\u{a82}', GC_Extend), ('\u{a83}', '\u{a83}', GC_SpacingMark), + ('\u{abc}', '\u{abc}', GC_Extend), ('\u{abe}', '\u{ac0}', GC_SpacingMark), ('\u{ac1}', + '\u{ac5}', GC_Extend), ('\u{ac7}', '\u{ac8}', GC_Extend), ('\u{ac9}', '\u{ac9}', + GC_SpacingMark), ('\u{acb}', '\u{acc}', GC_SpacingMark), ('\u{acd}', '\u{acd}', GC_Extend), + ('\u{ae2}', '\u{ae3}', GC_Extend), ('\u{b01}', '\u{b01}', GC_Extend), ('\u{b02}', '\u{b03}', + GC_SpacingMark), ('\u{b3c}', '\u{b3c}', GC_Extend), ('\u{b3e}', '\u{b3f}', GC_Extend), + ('\u{b40}', '\u{b40}', GC_SpacingMark), ('\u{b41}', '\u{b44}', GC_Extend), ('\u{b47}', + '\u{b48}', GC_SpacingMark), ('\u{b4b}', '\u{b4c}', GC_SpacingMark), ('\u{b4d}', '\u{b4d}', + GC_Extend), ('\u{b56}', '\u{b57}', GC_Extend), ('\u{b62}', '\u{b63}', GC_Extend), + ('\u{b82}', '\u{b82}', GC_Extend), ('\u{bbe}', '\u{bbe}', GC_Extend), ('\u{bbf}', '\u{bbf}', + GC_SpacingMark), ('\u{bc0}', '\u{bc0}', GC_Extend), ('\u{bc1}', '\u{bc2}', GC_SpacingMark), + ('\u{bc6}', '\u{bc8}', GC_SpacingMark), ('\u{bca}', '\u{bcc}', GC_SpacingMark), ('\u{bcd}', + '\u{bcd}', GC_Extend), ('\u{bd7}', '\u{bd7}', GC_Extend), ('\u{c00}', '\u{c00}', GC_Extend), + ('\u{c01}', '\u{c03}', GC_SpacingMark), ('\u{c3e}', '\u{c40}', GC_Extend), ('\u{c41}', + '\u{c44}', GC_SpacingMark), ('\u{c46}', '\u{c48}', GC_Extend), ('\u{c4a}', '\u{c4d}', + GC_Extend), ('\u{c55}', '\u{c56}', GC_Extend), ('\u{c62}', '\u{c63}', GC_Extend), + ('\u{c81}', '\u{c81}', GC_Extend), ('\u{c82}', '\u{c83}', GC_SpacingMark), ('\u{cbc}', + '\u{cbc}', GC_Extend), ('\u{cbe}', '\u{cbe}', GC_SpacingMark), ('\u{cbf}', '\u{cbf}', + GC_Extend), ('\u{cc0}', '\u{cc1}', GC_SpacingMark), ('\u{cc2}', '\u{cc2}', GC_Extend), + ('\u{cc3}', '\u{cc4}', GC_SpacingMark), ('\u{cc6}', '\u{cc6}', GC_Extend), ('\u{cc7}', + '\u{cc8}', GC_SpacingMark), ('\u{cca}', '\u{ccb}', GC_SpacingMark), ('\u{ccc}', '\u{ccd}', + GC_Extend), ('\u{cd5}', '\u{cd6}', GC_Extend), ('\u{ce2}', '\u{ce3}', GC_Extend), + ('\u{d01}', '\u{d01}', GC_Extend), ('\u{d02}', '\u{d03}', GC_SpacingMark), ('\u{d3e}', + '\u{d3e}', GC_Extend), ('\u{d3f}', '\u{d40}', GC_SpacingMark), ('\u{d41}', '\u{d44}', + GC_Extend), ('\u{d46}', '\u{d48}', GC_SpacingMark), ('\u{d4a}', '\u{d4c}', GC_SpacingMark), + ('\u{d4d}', '\u{d4d}', GC_Extend), ('\u{d4e}', '\u{d4e}', GC_Prepend), ('\u{d57}', + '\u{d57}', GC_Extend), ('\u{d62}', '\u{d63}', GC_Extend), ('\u{d82}', '\u{d83}', + GC_SpacingMark), ('\u{dca}', '\u{dca}', GC_Extend), ('\u{dcf}', '\u{dcf}', GC_Extend), + ('\u{dd0}', '\u{dd1}', GC_SpacingMark), ('\u{dd2}', '\u{dd4}', GC_Extend), ('\u{dd6}', + '\u{dd6}', GC_Extend), ('\u{dd8}', '\u{dde}', GC_SpacingMark), ('\u{ddf}', '\u{ddf}', + GC_Extend), ('\u{df2}', '\u{df3}', GC_SpacingMark), ('\u{e31}', '\u{e31}', GC_Extend), + ('\u{e33}', '\u{e33}', GC_SpacingMark), ('\u{e34}', '\u{e3a}', GC_Extend), ('\u{e47}', + '\u{e4e}', GC_Extend), ('\u{eb1}', '\u{eb1}', GC_Extend), ('\u{eb3}', '\u{eb3}', + GC_SpacingMark), ('\u{eb4}', '\u{eb9}', GC_Extend), ('\u{ebb}', '\u{ebc}', GC_Extend), + ('\u{ec8}', '\u{ecd}', GC_Extend), ('\u{f18}', '\u{f19}', GC_Extend), ('\u{f35}', '\u{f35}', + GC_Extend), ('\u{f37}', '\u{f37}', GC_Extend), ('\u{f39}', '\u{f39}', GC_Extend), + ('\u{f3e}', '\u{f3f}', GC_SpacingMark), ('\u{f71}', '\u{f7e}', GC_Extend), ('\u{f7f}', + '\u{f7f}', GC_SpacingMark), ('\u{f80}', '\u{f84}', GC_Extend), ('\u{f86}', '\u{f87}', + GC_Extend), ('\u{f8d}', '\u{f97}', GC_Extend), ('\u{f99}', '\u{fbc}', GC_Extend), ('\u{fc6}', '\u{fc6}', GC_Extend), ('\u{102d}', '\u{1030}', GC_Extend), ('\u{1031}', '\u{1031}', GC_SpacingMark), ('\u{1032}', '\u{1037}', GC_Extend), ('\u{1039}', '\u{103a}', GC_Extend), ('\u{103b}', '\u{103c}', GC_SpacingMark), ('\u{103d}', '\u{103e}', GC_Extend), @@ -868,7 +872,7 @@ pub mod word { pub use self::WordCat::*; #[allow(non_camel_case_types)] - #[derive(Clone, Copy, PartialEq, Eq, Debug)] + #[derive(Clone, Copy, PartialEq, Eq)] pub enum WordCat { WC_ALetter, WC_Any, From deebd8a2d02bb462a89f2c8dcc81d64f36da6683 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Wed, 15 Mar 2017 17:15:28 -0700 Subject: [PATCH 7/7] Fix little typo --- src/grapheme.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/grapheme.rs b/src/grapheme.rs index d523e27..c07fc8c 100644 --- a/src/grapheme.rs +++ b/src/grapheme.rs @@ -335,7 +335,7 @@ impl GraphemeCursor { /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete}; /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; /// let mut cursor = GraphemeCursor::new(8, flags.len(), false); - /// // Note enough pre-context to decide if there's a boundary between the two flags. + /// // Not enough pre-context to decide if there's a boundary between the two flags. /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8))); /// // Provide one more Regional Indicator Symbol of pre-context /// cursor.provide_context(&flags[4..8], 4);