diff --git a/scripts/unicode.py b/scripts/unicode.py index 53dfe74..189832b 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -330,21 +330,13 @@ def emit_break_module(f, break_table, break_cats, name): grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt", []) # Control - # Note 1: + # Note: # This category also includes Cs (surrogate codepoints), but Rust's `char`s are # Unicode Scalar Values only, and surrogates are thus invalid `char`s. # Thus, we have to remove Cs from the Control category - # Note 2: - # 0x0a and 0x0d (CR and LF) are not in the Control category for Graphemes. - # However, the Graphemes iterator treats these as a special case, so they - # should be included in grapheme_cats["Control"] for our implementation. grapheme_cats["Control"] = group_cat(list( - (set(ungroup_cat(grapheme_cats["Control"])) - | set(ungroup_cat(grapheme_cats["CR"])) - | set(ungroup_cat(grapheme_cats["LF"]))) + set(ungroup_cat(grapheme_cats["Control"])) - set(ungroup_cat([surrogate_codepoints])))) - del(grapheme_cats["CR"]) - del(grapheme_cats["LF"]) grapheme_table = [] for cat in grapheme_cats: diff --git a/src/grapheme.rs b/src/grapheme.rs index 49039b7..c07fc8c 100644 --- a/src/grapheme.rs +++ b/src/grapheme.rs @@ -64,10 +64,8 @@ impl<'a> DoubleEndedIterator for GraphemeIndices<'a> { #[derive(Clone)] pub struct Graphemes<'a> { string: &'a str, - extended: bool, - cat: Option, - catb: Option, - regional_count_back: Option, + cursor: GraphemeCursor, + cursor_back: GraphemeCursor, } impl<'a> Graphemes<'a> { @@ -85,351 +83,579 @@ impl<'a> Graphemes<'a> { /// assert_eq!(iter.as_str(), ""); /// ``` pub fn as_str(&self) -> &'a str { - self.string + &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()] } } -// state machine for cluster boundary rules -#[derive(Copy,Clone,PartialEq,Eq)] -enum GraphemeState { - Start, - FindExtend, - HangulL, - HangulLV, - HangulLVT, - Prepend, - Regional, - Emoji, - Zwj, -} - impl<'a> Iterator for Graphemes<'a> { type Item = &'a str; #[inline] fn size_hint(&self) -> (usize, Option) { - let slen = self.string.len(); + let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor(); (cmp::min(slen, 1), Some(slen)) } #[inline] fn next(&mut self) -> Option<&'a str> { - use self::GraphemeState::*; - use tables::grapheme as gr; - if self.string.len() == 0 { + let start = self.cursor.cur_cursor(); + if start == self.cursor_back.cur_cursor() { return None; } + let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap(); + Some(&self.string[start..next]) + } +} - let mut take_curr = true; - let mut idx = 0; - let mut state = Start; - let mut cat = gr::GC_Any; +impl<'a> DoubleEndedIterator for Graphemes<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + let end = self.cursor_back.cur_cursor(); + if end == self.cursor.cur_cursor() { + return None; + } + let prev = self.cursor_back.prev_boundary(self.string, 0).unwrap().unwrap(); + Some(&self.string[prev..end]) + } +} - // caching used by next_back() should be invalidated - self.regional_count_back = None; - self.catb = None; +#[inline] +pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> { + let len = s.len(); + Graphemes { + string: s, + cursor: GraphemeCursor::new(0, len, is_extended), + cursor_back: GraphemeCursor::new(len, len, is_extended), + } +} - for (curr, ch) in self.string.char_indices() { - idx = curr; +#[inline] +pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> { + GraphemeIndices { start_offset: s.as_ptr() as usize, iter: new_graphemes(s, is_extended) } +} - // retrieve cached category, if any - // We do this because most of the time we would end up - // looking up each character twice. - cat = match self.cat { - None => gr::grapheme_category(ch), - _ => self.cat.take().unwrap() - }; +// maybe unify with PairResult? +// An enum describing information about a potential boundary. +#[derive(PartialEq, Eq, Clone)] +enum GraphemeState { + // No information is known. + Unknown, + // It is known to not be a boundary. + NotBreak, + // It is known to be a boundary. + Break, + // The codepoint after is a Regional Indicator Symbol, so a boundary iff + // it is preceded by an even number of RIS codepoints. (GB12, GB13) + Regional, + // The codepoint after is in the E_Modifier category, so whether it's a boundary + // depends on pre-context according to GB10. + Emoji, +} - if (state, cat) == (Emoji, gr::GC_Extend) { - continue; // rule GB10 - } +/// Cursor-based segmenter for grapheme clusters. +#[derive(Clone)] +pub struct GraphemeCursor { + // Current cursor position. + offset: usize, + // Total length of the string. + len: usize, + // A config flag indicating whether this cursor computes legacy or extended + // grapheme cluster boundaries (enables GB9a and GB9b if set). + is_extended: bool, + // Information about the potential boundary at `offset` + state: GraphemeState, + // Category of codepoint immediately preceding cursor, if known. + cat_before: Option, + // Category of codepoint immediately after cursor, if known. + cat_after: Option, + // If set, at least one more codepoint immediately preceding this offset + // is needed to resolve whether there's a boundary at `offset`. + pre_context_offset: Option, + // The number of RIS codepoints preceding `offset`. If `pre_context_offset` + // is set, then counts the number of RIS between that and `offset`, otherwise + // is an accurate count relative to the string. + ris_count: Option, + // Set if a call to `prev_boundary` or `next_boundary` was suspended due + // to needing more input. + resuming: bool, +} - if let Some(new_state) = match cat { - gr::GC_Extend => Some(FindExtend), // rule GB9 - gr::GC_SpacingMark if self.extended => Some(FindExtend), // rule GB9a - gr::GC_ZWJ => Some(Zwj), // rule GB9/GB11 - _ => None - } { - state = new_state; - continue; - } +/// An error return indicating that not enough content was available in the +/// provided chunk to satisfy the query, and that more content must be provided. +#[derive(PartialEq, Eq, Debug)] +pub enum GraphemeIncomplete { + /// More pre-context is needed. The caller should call `provide_context` + /// with a chunk ending at the offset given, then retry the query. This + /// will only be returned if the `chunk_start` parameter is nonzero. + PreContext(usize), + + /// When requesting `prev_boundary`, the cursor is moving past the beginning + /// of the current chunk, so the chunk before that is requested. This will + /// only be returned if the `chunk_start` parameter is nonzero. + PrevChunk, + + /// When requesting `next_boundary`, the cursor is moving past the end of the + /// current chunk, so the chunk after that is requested. This will only be + /// returned if the chunk ends before the `len` parameter provided on + /// creation of the cursor. + NextChunk, // requesting chunk following the one given + + /// An error returned when the chunk given does not contain the cursor position. + InvalidOffset, +} - state = match state { - Start if '\r' == ch => { - let slen = self.string.len(); - let nidx = idx + 1; - if nidx != slen && self.string[nidx..].chars().next().unwrap() == '\n' { - idx = nidx; // rule GB3 - } - break; // rule GB4 - } - Start | Prepend => match cat { - gr::GC_Control => { // rule GB5 - take_curr = state == Start; - break; - } - gr::GC_L => HangulL, - gr::GC_LV | gr::GC_V => HangulLV, - gr::GC_LVT | gr::GC_T => HangulLVT, - gr::GC_Prepend if self.extended => Prepend, - gr::GC_Regional_Indicator => Regional, - gr::GC_E_Base | gr::GC_E_Base_GAZ => Emoji, - _ => FindExtend - }, - FindExtend => { // found non-extending when looking for extending - take_curr = false; - break; - }, - HangulL => match cat { // rule GB6: L x (L|V|LV|LVT) - gr::GC_L => continue, - gr::GC_LV | gr::GC_V => HangulLV, - gr::GC_LVT => HangulLVT, - _ => { - take_curr = false; - break; - } - }, - HangulLV => match cat { // rule GB7: (LV|V) x (V|T) - gr::GC_V => continue, - gr::GC_T => HangulLVT, - _ => { - take_curr = false; - break; - } - }, - HangulLVT => match cat { // rule GB8: (LVT|T) x T - gr::GC_T => continue, - _ => { - take_curr = false; - break; - } - }, - Regional => match cat { // rule GB12/GB13 - gr::GC_Regional_Indicator => FindExtend, - _ => { - take_curr = false; - break; - } - }, - Emoji => match cat { // rule GB10: (E_Base|EBG) Extend* x E_Modifier - gr::GC_E_Modifier => continue, - _ => { - take_curr = false; - break; - } - }, - Zwj => match cat { // rule GB11: ZWJ x (GAZ|EBG) - gr::GC_Glue_After_Zwj => continue, - gr::GC_E_Base_GAZ => Emoji, - _ => { - take_curr = false; - break; - } - }, - } - } +// An enum describing the result from lookup of a pair of categories. +#[derive(PartialEq, Eq)] +enum PairResult { + NotBreak, // definitely not a break + Break, // definitely a break + Extended, // a break iff not in extended mode + Regional, // a break if preceded by an even number of RIS + Emoji, // a break if preceded by emoji base and (Extend)* +} - self.cat = if take_curr { - idx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); - None +fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult { + use tables::grapheme::GraphemeCat::*; + use self::PairResult::*; + match (before, after) { + (GC_CR, GC_LF) => NotBreak, // GB3 + (GC_Control, _) => Break, // GB4 + (GC_CR, _) => Break, // GB4 + (GC_LF, _) => Break, // GB4 + (_, GC_Control) => Break, // GB5 + (_, GC_CR) => Break, // GB5 + (_, GC_LF) => Break, // GB5 + (GC_L, GC_L) => NotBreak, // GB6 + (GC_L, GC_V) => NotBreak, // GB6 + (GC_L, GC_LV) => NotBreak, // GB6 + (GC_L, GC_LVT) => NotBreak, // GB6 + (GC_LV, GC_V) => NotBreak, // GB7 + (GC_LV, GC_T) => NotBreak, // GB7 + (GC_V, GC_V) => NotBreak, // GB7 + (GC_V, GC_T) => NotBreak, // GB7 + (GC_LVT, GC_T) => NotBreak, // GB8 + (GC_T, GC_T) => NotBreak, // GB8 + (_, GC_Extend) => NotBreak, // GB9 + (_, GC_ZWJ) => NotBreak, // GB9 + (_, GC_SpacingMark) => Extended, // GB9a + (GC_Prepend, _) => Extended, // GB9b + (GC_E_Base, GC_E_Modifier) => NotBreak, // GB10 + (GC_E_Base_GAZ, GC_E_Modifier) => NotBreak, // GB10 + (GC_Extend, GC_E_Modifier) => Emoji, // GB10 + (GC_ZWJ, GC_Glue_After_Zwj) => NotBreak, // GB11 + (GC_ZWJ, GC_E_Base_GAZ) => NotBreak, // GB11 + (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13 + (_, _) => Break, // GB999 + } +} + +impl GraphemeCursor { + /// Create a new cursor. The string and initial offset are given at creation + /// time, but the contents of the string are not. The `is_extended` parameter + /// controls whether extended grapheme clusters are selected. + /// + /// The `offset` parameter must be on a codepoint boundary. + /// + /// ```rust + /// # use unicode_segmentation::GraphemeCursor; + /// let s = "हिन्दी"; + /// let mut legacy = GraphemeCursor::new(0, s.len(), false); + /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len()))); + /// let mut extended = GraphemeCursor::new(0, s.len(), true); + /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len()))); + /// ``` + pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor { + let state = if offset == 0 || offset == len { + GraphemeState::Break } else { - Some(cat) + GraphemeState::Unknown }; + GraphemeCursor { + offset: offset, + len: len, + state: state, + is_extended: is_extended, + cat_before: None, + cat_after: None, + pre_context_offset: None, + ris_count: None, + resuming: false, + } + } - let retstr = &self.string[..idx]; - self.string = &self.string[idx..]; - Some(retstr) + // Not sure I'm gonna keep this, the advantage over new() seems thin. + + /// Set the cursor to a new location in the same string. + /// + /// ```rust + /// # use unicode_segmentation::GraphemeCursor; + /// let s = "abcd"; + /// let mut cursor = GraphemeCursor::new(0, s.len(), false); + /// assert_eq!(cursor.cur_cursor(), 0); + /// cursor.set_cursor(2); + /// assert_eq!(cursor.cur_cursor(), 2); + /// ``` + pub fn set_cursor(&mut self, offset: usize) { + if offset != self.offset { + self.offset = offset; + self.state = if offset == 0 || offset == self.len { + GraphemeState::Break + } else { + GraphemeState::Unknown + }; + // reset state derived from text around cursor + self.cat_before = None; + self.cat_after = None; + self.ris_count = None; + } } -} -impl<'a> DoubleEndedIterator for Graphemes<'a> { - #[inline] - fn next_back(&mut self) -> Option<&'a str> { - use self::GraphemeState::*; + /// The current offset of the cursor. Equal to the last value provided to + /// `new()` or `set_cursor()`, or returned from `next_boundary()` or + /// `prev_boundary()`. + /// + /// ```rust + /// # use unicode_segmentation::GraphemeCursor; + /// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes. + /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; + /// let mut cursor = GraphemeCursor::new(4, flags.len(), false); + /// assert_eq!(cursor.cur_cursor(), 4); + /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8))); + /// assert_eq!(cursor.cur_cursor(), 8); + /// ``` + pub fn cur_cursor(&self) -> usize { + self.offset + } + + /// Provide additional pre-context when it is needed to decide a boundary. + /// The end of the chunk must coincide with the value given in the + /// `GraphemeIncomplete::PreContext` request. + /// + /// ```rust + /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete}; + /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; + /// let mut cursor = GraphemeCursor::new(8, flags.len(), false); + /// // Not enough pre-context to decide if there's a boundary between the two flags. + /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8))); + /// // Provide one more Regional Indicator Symbol of pre-context + /// cursor.provide_context(&flags[4..8], 4); + /// // Still not enough context to decide. + /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4))); + /// // Provide additional requested context. + /// cursor.provide_context(&flags[0..4], 0); + /// // That's enough to decide (it always is when context goes to the start of the string) + /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true)); + /// ``` + pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) { use tables::grapheme as gr; - if self.string.len() == 0 { - return None; + assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap()); + self.pre_context_offset = None; + if self.is_extended && chunk_start + chunk.len() == self.offset { + let ch = chunk.chars().rev().next().unwrap(); + if gr::grapheme_category(ch) == gr::GC_Prepend { + self.decide(false); // GB9b + return; + } + } + match self.state { + GraphemeState::Regional => self.handle_regional(chunk, chunk_start), + GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start), + _ => panic!("invalid state") } + } - let mut take_curr = true; - let mut idx = self.string.len(); - let mut previdx = idx; - let mut state = Start; - let mut cat = gr::GC_Any; + fn decide(&mut self, is_break: bool) { + self.state = if is_break { + GraphemeState::Break + } else { + GraphemeState::NotBreak + }; + } - // caching used by next() should be invalidated - self.cat = None; + fn decision(&mut self, is_break: bool) -> Result { + self.decide(is_break); + Ok(is_break) + } - 'outer: for (curr, ch) in self.string.char_indices().rev() { - previdx = idx; - idx = curr; + fn is_boundary_result(&self) -> Result { + if self.state == GraphemeState::Break { + Ok(true) + } else if self.state == GraphemeState::NotBreak { + Ok(false) + } else if let Some(pre_context_offset) = self.pre_context_offset { + Err(GraphemeIncomplete::PreContext(pre_context_offset)) + } else { + unreachable!("inconsistent state"); + } + } - // cached category, if any - cat = match self.catb { - None => gr::grapheme_category(ch), - _ => self.catb.take().unwrap() - }; + fn handle_regional(&mut self, chunk: &str, chunk_start: usize) { + use tables::grapheme as gr; + let mut ris_count = self.ris_count.unwrap_or(0); + for ch in chunk.chars().rev() { + if gr::grapheme_category(ch) != gr::GC_Regional_Indicator { + self.ris_count = Some(ris_count); + self.decide((ris_count % 2) == 0); + return; + } + ris_count += 1; + } + self.ris_count = Some(ris_count); + if chunk_start == 0 { + self.decide((ris_count % 2) == 0); + return; + } + self.pre_context_offset = Some(chunk_start); + } - // a matching state machine that runs *backwards* across an input string - // note that this has some implications for the Hangul matching, since - // we now need to know what the rightward letter is: - // - // Right to left, we have: - // L x L - // V x (L|V|LV) - // T x (V|T|LV|LVT) - // HangulL means the letter to the right is L - // HangulLV means the letter to the right is V - // HangulLVT means the letter to the right is T - state = match state { - Start if '\n' == ch => { - if idx > 0 && '\r' == self.string[..idx].chars().next_back().unwrap() { - idx -= 1; // rule GB3 - } - break; // rule GB4 - }, - Start | FindExtend => match cat { - gr::GC_Extend => FindExtend, - gr::GC_SpacingMark if self.extended => FindExtend, - gr::GC_ZWJ => FindExtend, - gr::GC_E_Modifier => Emoji, - gr::GC_Glue_After_Zwj | gr::GC_E_Base_GAZ => Zwj, - gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL, - gr::GC_V => HangulLV, - gr::GC_T => HangulLVT, - gr::GC_Regional_Indicator => Regional, - gr::GC_Control => { - take_curr = Start == state; - break; - }, - _ => break - }, - HangulL => match cat { // char to right is an L - gr::GC_L => continue, // L x L is the only legal match - _ => { - take_curr = false; - break; - } - }, - HangulLV => match cat { // char to right is a V - gr::GC_V => continue, // V x V, right char is still V - gr::GC_L | gr::GC_LV => HangulL, // (L|V) x V, right char is now L - _ => { - take_curr = false; - break; - } - }, - HangulLVT => match cat { // char to right is a T - gr::GC_T => continue, // T x T, right char is still T - gr::GC_V => HangulLV, // V x T, right char is now V - gr::GC_LV | gr::GC_LVT => HangulL, // (LV|LVT) x T, right char is now L - _ => { - take_curr = false; - break; - } - }, - Prepend => { - // not used in reverse iteration - unreachable!() - }, - Regional => { // rule GB12/GB13 - // Need to scan backward to find if this is preceded by an odd or even number - // of Regional_Indicator characters. - let count = match self.regional_count_back { - Some(count) => count, - None => self.string[..previdx].chars().rev().take_while(|c| { - gr::grapheme_category(*c) == gr::GC_Regional_Indicator - }).count() - }; - // Cache the count to avoid re-scanning the same chars on the next iteration. - self.regional_count_back = count.checked_sub(1); - - if count % 2 == 0 { - take_curr = false; - break; - } - continue; - }, - Emoji => { // char to right is E_Modifier - // In order to decide whether to break before this E_Modifier char, we need to - // scan backward past any Extend chars to look for (E_Base|(ZWJ? EBG)). - let mut ebg_idx = None; - for (startidx, prev) in self.string[..previdx].char_indices().rev() { - match (ebg_idx, gr::grapheme_category(prev)) { - (None, gr::GC_Extend) => continue, - (None, gr::GC_E_Base) => { // rule GB10 - // Found an Emoji modifier sequence. Return the whole sequence. - idx = startidx; - break 'outer; - } - (None, gr::GC_E_Base_GAZ) => { // rule GB10 - // Keep scanning in case this is part of an ZWJ x EBJ pair. - ebg_idx = Some(startidx); - } - (Some(_), gr::GC_ZWJ) => { // rule GB11 - idx = startidx; - break 'outer; - } - _ => break - } - } - if let Some(ebg_idx) = ebg_idx { - // Found an EBG without a ZWJ before it. - idx = ebg_idx; - break; - } - // Not part of an Emoji modifier sequence. Break here. - take_curr = false; - break; - }, - Zwj => match cat { // char to right is (GAZ|EBG) - gr::GC_ZWJ => FindExtend, // rule GB11: ZWJ x (GAZ|EBG) - _ => { - take_curr = false; - break; - } + fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) { + use tables::grapheme as gr; + for ch in chunk.chars().rev() { + match gr::grapheme_category(ch) { + gr::GC_Extend => (), + gr::GC_E_Base | gr::GC_E_Base_GAZ => { + self.decide(false); + return; + } + _ => { + self.decide(true); + return; } } } + if chunk_start == 0 { + self.decide(true); + return; + } + self.pre_context_offset = Some(chunk_start); + } - self.catb = if take_curr { - None - } else { - idx = previdx; - Some(cat) - }; - - if self.extended && cat != gr::GC_Control { - // rule GB9b: include any preceding Prepend characters - for (i, c) in self.string[..idx].char_indices().rev() { - match gr::grapheme_category(c) { - gr::GC_Prepend => idx = i, - cat => { - self.catb = Some(cat); - break; - } + /// Determine whether the current cursor location is a grapheme cluster boundary. + /// Only a part of the string need be supplied. If `chunk_start` is nonzero or + /// the length of `chunk` is not equal to `len` on creation, then this method + /// may return `GraphemeIncomplete::PreContext`. The caller should then + /// call `provide_context` with the requested chunk, then retry calling this + /// method. + /// + /// For partial chunks, if the cursor is not at the beginning or end of the + /// string, the chunk should contain at least the codepoint following the cursor. + /// If the string is nonempty, the chunk must be nonempty. + /// + /// All calls should have consistent chunk contents (ie, if a chunk provides + /// content for a given slice, all further chunks covering that slice must have + /// the same content for it). + /// + /// ```rust + /// # use unicode_segmentation::GraphemeCursor; + /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; + /// let mut cursor = GraphemeCursor::new(8, flags.len(), false); + /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true)); + /// cursor.set_cursor(12); + /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false)); + /// ``` + pub fn is_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result { + use tables::grapheme as gr; + if self.state == GraphemeState::Break { + return Ok(true) + } + if self.state == GraphemeState::NotBreak { + return Ok(false) + } + if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() { + if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() { + return Err(GraphemeIncomplete::InvalidOffset) + } + } + if let Some(pre_context_offset) = self.pre_context_offset { + return Err(GraphemeIncomplete::PreContext(pre_context_offset)); + } + let offset_in_chunk = self.offset - chunk_start; + if self.cat_after.is_none() { + let ch = chunk[offset_in_chunk..].chars().next().unwrap(); + self.cat_after = Some(gr::grapheme_category(ch)); + } + if self.offset == chunk_start { + let mut need_pre_context = true; + match self.cat_after.unwrap() { + gr::GC_Regional_Indicator => self.state = GraphemeState::Regional, + gr::GC_E_Modifier => self.state = GraphemeState::Emoji, + _ => need_pre_context = self.cat_before.is_none(), + } + if need_pre_context { + self.pre_context_offset = Some(chunk_start); + return Err(GraphemeIncomplete::PreContext(chunk_start)); + } + } + if self.cat_before.is_none() { + let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap(); + self.cat_before = Some(gr::grapheme_category(ch)); + } + match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) { + PairResult::NotBreak => return self.decision(false), + PairResult::Break => return self.decision(true), + PairResult::Extended => { + let is_extended = self.is_extended; + return self.decision(!is_extended); + } + PairResult::Regional => { + if let Some(ris_count) = self.ris_count { + return self.decision((ris_count % 2) == 0); } + self.handle_regional(&chunk[..offset_in_chunk], chunk_start); + self.is_boundary_result() + } + PairResult::Emoji => { + self.handle_emoji(&chunk[..offset_in_chunk], chunk_start); + self.is_boundary_result() } } - - let retstr = &self.string[idx..]; - self.string = &self.string[..idx]; - Some(retstr) } -} -#[inline] -pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> { - Graphemes { - string: s, - extended: is_extended, - cat: None, - catb: None, - regional_count_back: None + /// Find the next boundary after the current cursor position. Only a part of + /// the string need be supplied. If the chunk is incomplete, then this + /// method might return `GraphemeIncomplete::PreContext` or + /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should + /// call `provide_context` with the requested chunk, then retry. In the + /// latter case, the caller should provide the chunk following the one + /// given, then retry. + /// + /// See `is_boundary` for expectations on the provided chunk. + /// + /// ```rust + /// # use unicode_segmentation::GraphemeCursor; + /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; + /// let mut cursor = GraphemeCursor::new(4, flags.len(), false); + /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8))); + /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16))); + /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None)); + /// ``` + /// + /// And an example that uses partial strings: + /// + /// ```rust + /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete}; + /// let s = "abcd"; + /// let mut cursor = GraphemeCursor::new(0, s.len(), false); + /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1))); + /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk)); + /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2))); + /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3))); + /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4))); + /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None)); + /// ``` + pub fn next_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result, GraphemeIncomplete> { + use tables::grapheme as gr; + if self.offset == self.len { + return Ok(None); + } + let mut iter = chunk[self.offset - chunk_start..].chars(); + let mut ch = iter.next().unwrap(); + loop { + if self.resuming { + if self.cat_after.is_none() { + self.cat_after = Some(gr::grapheme_category(ch)); + } + } else { + self.offset += ch.len_utf8(); + self.state = GraphemeState::Unknown; + self.cat_before = self.cat_after.take(); + if self.cat_before.is_none() { + self.cat_before = Some(gr::grapheme_category(ch)); + } + if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator { + self.ris_count = self.ris_count.map(|c| c + 1); + } else { + self.ris_count = Some(0); + } + if let Some(next_ch) = iter.next() { + ch = next_ch; + self.cat_after = Some(gr::grapheme_category(ch)); + } else if self.offset == self.len { + self.decide(true); + } else { + self.resuming = true; + return Err(GraphemeIncomplete::NextChunk); + } + } + self.resuming = true; + if self.is_boundary(chunk, chunk_start)? { + self.resuming = false; + return Ok(Some(self.offset)); + } + self.resuming = false; + } } -} -#[inline] -pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> { - GraphemeIndices { start_offset: s.as_ptr() as usize, iter: new_graphemes(s, is_extended) } + /// Find the previous boundary after the current cursor position. Only a part + /// of the string need be supplied. If the chunk is incomplete, then this + /// method might return `GraphemeIncomplete::PreContext` or + /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should + /// call `provide_context` with the requested chunk, then retry. In the + /// latter case, the caller should provide the chunk preceding the one + /// given, then retry. + /// + /// See `is_boundary` for expectations on the provided chunk. + /// + /// ```rust + /// # use unicode_segmentation::GraphemeCursor; + /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}"; + /// let mut cursor = GraphemeCursor::new(12, flags.len(), false); + /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8))); + /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0))); + /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None)); + /// ``` + /// + /// And an example that uses partial strings (note the exact return is not + /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily): + /// + /// ```rust + /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete}; + /// let s = "abcd"; + /// let mut cursor = GraphemeCursor::new(4, s.len(), false); + /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3))); + /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk)); + /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2))); + /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1))); + /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0))); + /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None)); + /// ``` + pub fn prev_boundary(&mut self, chunk: &str, chunk_start: usize) -> Result, GraphemeIncomplete> { + use tables::grapheme as gr; + if self.offset == 0 { + return Ok(None); + } + let mut iter = chunk[..self.offset - chunk_start].chars().rev(); + let mut ch = iter.next().unwrap(); + loop { + if self.offset == chunk_start { + self.resuming = true; + return Err(GraphemeIncomplete::PrevChunk); + } + if self.resuming { + self.cat_before = Some(gr::grapheme_category(ch)); + } else { + self.offset -= ch.len_utf8(); + self.cat_after = self.cat_before.take(); + self.state = GraphemeState::Unknown; + if let Some(ris_count) = self.ris_count { + self.ris_count = if ris_count > 0 { Some(ris_count - 1) } else { None }; + } + if let Some(prev_ch) = iter.next() { + ch = prev_ch; + self.cat_before = Some(gr::grapheme_category(ch)); + } else if self.offset == 0 { + self.decide(true); + } else { + self.resuming = true; + return Err(GraphemeIncomplete::PrevChunk); + } + } + self.resuming = true; + if self.is_boundary(chunk, chunk_start)? { + self.resuming = false; + return Ok(Some(self.offset)); + } + self.resuming = false; + } + } } diff --git a/src/lib.rs b/src/lib.rs index 96e6e3e..6f903c0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -64,6 +64,7 @@ extern crate std; extern crate quickcheck; pub use grapheme::{Graphemes, GraphemeIndices}; +pub use grapheme::{GraphemeCursor, GraphemeIncomplete}; pub use tables::UNICODE_VERSION; pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords}; diff --git a/src/tables.rs b/src/tables.rs index b491f62..625a588 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -296,6 +296,7 @@ pub mod grapheme { #[derive(Clone, Copy, PartialEq, Eq)] pub enum GraphemeCat { GC_Any, + GC_CR, GC_Control, GC_E_Base, GC_E_Base_GAZ, @@ -303,6 +304,7 @@ pub mod grapheme { GC_Extend, GC_Glue_After_Zwj, GC_L, + GC_LF, GC_LV, GC_LVT, GC_Prepend, @@ -333,71 +335,73 @@ pub mod grapheme { } const grapheme_cat_table: &'static [(char, char, GraphemeCat)] = &[ - ('\u{0}', '\u{1f}', GC_Control), ('\u{7f}', '\u{9f}', GC_Control), ('\u{ad}', '\u{ad}', - GC_Control), ('\u{300}', '\u{36f}', GC_Extend), ('\u{483}', '\u{489}', GC_Extend), - ('\u{591}', '\u{5bd}', GC_Extend), ('\u{5bf}', '\u{5bf}', GC_Extend), ('\u{5c1}', '\u{5c2}', - GC_Extend), ('\u{5c4}', '\u{5c5}', GC_Extend), ('\u{5c7}', '\u{5c7}', GC_Extend), - ('\u{600}', '\u{605}', GC_Prepend), ('\u{610}', '\u{61a}', GC_Extend), ('\u{61c}', - '\u{61c}', GC_Control), ('\u{64b}', '\u{65f}', GC_Extend), ('\u{670}', '\u{670}', - GC_Extend), ('\u{6d6}', '\u{6dc}', GC_Extend), ('\u{6dd}', '\u{6dd}', GC_Prepend), - ('\u{6df}', '\u{6e4}', GC_Extend), ('\u{6e7}', '\u{6e8}', GC_Extend), ('\u{6ea}', '\u{6ed}', - GC_Extend), ('\u{70f}', '\u{70f}', GC_Prepend), ('\u{711}', '\u{711}', GC_Extend), - ('\u{730}', '\u{74a}', GC_Extend), ('\u{7a6}', '\u{7b0}', GC_Extend), ('\u{7eb}', '\u{7f3}', - GC_Extend), ('\u{816}', '\u{819}', GC_Extend), ('\u{81b}', '\u{823}', GC_Extend), - ('\u{825}', '\u{827}', GC_Extend), ('\u{829}', '\u{82d}', GC_Extend), ('\u{859}', '\u{85b}', - GC_Extend), ('\u{8d4}', '\u{8e1}', GC_Extend), ('\u{8e2}', '\u{8e2}', GC_Prepend), - ('\u{8e3}', '\u{902}', GC_Extend), ('\u{903}', '\u{903}', GC_SpacingMark), ('\u{93a}', - '\u{93a}', GC_Extend), ('\u{93b}', '\u{93b}', GC_SpacingMark), ('\u{93c}', '\u{93c}', - GC_Extend), ('\u{93e}', '\u{940}', GC_SpacingMark), ('\u{941}', '\u{948}', GC_Extend), - ('\u{949}', '\u{94c}', GC_SpacingMark), ('\u{94d}', '\u{94d}', GC_Extend), ('\u{94e}', - '\u{94f}', GC_SpacingMark), ('\u{951}', '\u{957}', GC_Extend), ('\u{962}', '\u{963}', - GC_Extend), ('\u{981}', '\u{981}', GC_Extend), ('\u{982}', '\u{983}', GC_SpacingMark), - ('\u{9bc}', '\u{9bc}', GC_Extend), ('\u{9be}', '\u{9be}', GC_Extend), ('\u{9bf}', '\u{9c0}', - GC_SpacingMark), ('\u{9c1}', '\u{9c4}', GC_Extend), ('\u{9c7}', '\u{9c8}', GC_SpacingMark), - ('\u{9cb}', '\u{9cc}', GC_SpacingMark), ('\u{9cd}', '\u{9cd}', GC_Extend), ('\u{9d7}', - '\u{9d7}', GC_Extend), ('\u{9e2}', '\u{9e3}', GC_Extend), ('\u{a01}', '\u{a02}', GC_Extend), - ('\u{a03}', '\u{a03}', GC_SpacingMark), ('\u{a3c}', '\u{a3c}', GC_Extend), ('\u{a3e}', - '\u{a40}', GC_SpacingMark), ('\u{a41}', '\u{a42}', GC_Extend), ('\u{a47}', '\u{a48}', - GC_Extend), ('\u{a4b}', '\u{a4d}', GC_Extend), ('\u{a51}', '\u{a51}', GC_Extend), - ('\u{a70}', '\u{a71}', GC_Extend), ('\u{a75}', '\u{a75}', GC_Extend), ('\u{a81}', '\u{a82}', - GC_Extend), ('\u{a83}', '\u{a83}', GC_SpacingMark), ('\u{abc}', '\u{abc}', GC_Extend), - ('\u{abe}', '\u{ac0}', GC_SpacingMark), ('\u{ac1}', '\u{ac5}', GC_Extend), ('\u{ac7}', - '\u{ac8}', GC_Extend), ('\u{ac9}', '\u{ac9}', GC_SpacingMark), ('\u{acb}', '\u{acc}', - GC_SpacingMark), ('\u{acd}', '\u{acd}', GC_Extend), ('\u{ae2}', '\u{ae3}', GC_Extend), - ('\u{b01}', '\u{b01}', GC_Extend), ('\u{b02}', '\u{b03}', GC_SpacingMark), ('\u{b3c}', - '\u{b3c}', GC_Extend), ('\u{b3e}', '\u{b3f}', GC_Extend), ('\u{b40}', '\u{b40}', - GC_SpacingMark), ('\u{b41}', '\u{b44}', GC_Extend), ('\u{b47}', '\u{b48}', GC_SpacingMark), - ('\u{b4b}', '\u{b4c}', GC_SpacingMark), ('\u{b4d}', '\u{b4d}', GC_Extend), ('\u{b56}', - '\u{b57}', GC_Extend), ('\u{b62}', '\u{b63}', GC_Extend), ('\u{b82}', '\u{b82}', GC_Extend), - ('\u{bbe}', '\u{bbe}', GC_Extend), ('\u{bbf}', '\u{bbf}', GC_SpacingMark), ('\u{bc0}', - '\u{bc0}', GC_Extend), ('\u{bc1}', '\u{bc2}', GC_SpacingMark), ('\u{bc6}', '\u{bc8}', - GC_SpacingMark), ('\u{bca}', '\u{bcc}', GC_SpacingMark), ('\u{bcd}', '\u{bcd}', GC_Extend), - ('\u{bd7}', '\u{bd7}', GC_Extend), ('\u{c00}', '\u{c00}', GC_Extend), ('\u{c01}', '\u{c03}', - GC_SpacingMark), ('\u{c3e}', '\u{c40}', GC_Extend), ('\u{c41}', '\u{c44}', GC_SpacingMark), - ('\u{c46}', '\u{c48}', GC_Extend), ('\u{c4a}', '\u{c4d}', GC_Extend), ('\u{c55}', '\u{c56}', - GC_Extend), ('\u{c62}', '\u{c63}', GC_Extend), ('\u{c81}', '\u{c81}', GC_Extend), - ('\u{c82}', '\u{c83}', GC_SpacingMark), ('\u{cbc}', '\u{cbc}', GC_Extend), ('\u{cbe}', - '\u{cbe}', GC_SpacingMark), ('\u{cbf}', '\u{cbf}', GC_Extend), ('\u{cc0}', '\u{cc1}', - GC_SpacingMark), ('\u{cc2}', '\u{cc2}', GC_Extend), ('\u{cc3}', '\u{cc4}', GC_SpacingMark), - ('\u{cc6}', '\u{cc6}', GC_Extend), ('\u{cc7}', '\u{cc8}', GC_SpacingMark), ('\u{cca}', - '\u{ccb}', GC_SpacingMark), ('\u{ccc}', '\u{ccd}', GC_Extend), ('\u{cd5}', '\u{cd6}', - GC_Extend), ('\u{ce2}', '\u{ce3}', GC_Extend), ('\u{d01}', '\u{d01}', GC_Extend), - ('\u{d02}', '\u{d03}', GC_SpacingMark), ('\u{d3e}', '\u{d3e}', GC_Extend), ('\u{d3f}', - '\u{d40}', GC_SpacingMark), ('\u{d41}', '\u{d44}', GC_Extend), ('\u{d46}', '\u{d48}', - GC_SpacingMark), ('\u{d4a}', '\u{d4c}', GC_SpacingMark), ('\u{d4d}', '\u{d4d}', GC_Extend), - ('\u{d4e}', '\u{d4e}', GC_Prepend), ('\u{d57}', '\u{d57}', GC_Extend), ('\u{d62}', - '\u{d63}', GC_Extend), ('\u{d82}', '\u{d83}', GC_SpacingMark), ('\u{dca}', '\u{dca}', - GC_Extend), ('\u{dcf}', '\u{dcf}', GC_Extend), ('\u{dd0}', '\u{dd1}', GC_SpacingMark), - ('\u{dd2}', '\u{dd4}', GC_Extend), ('\u{dd6}', '\u{dd6}', GC_Extend), ('\u{dd8}', '\u{dde}', - GC_SpacingMark), ('\u{ddf}', '\u{ddf}', GC_Extend), ('\u{df2}', '\u{df3}', GC_SpacingMark), - ('\u{e31}', '\u{e31}', GC_Extend), ('\u{e33}', '\u{e33}', GC_SpacingMark), ('\u{e34}', - '\u{e3a}', GC_Extend), ('\u{e47}', '\u{e4e}', GC_Extend), ('\u{eb1}', '\u{eb1}', GC_Extend), - ('\u{eb3}', '\u{eb3}', GC_SpacingMark), ('\u{eb4}', '\u{eb9}', GC_Extend), ('\u{ebb}', - '\u{ebc}', GC_Extend), ('\u{ec8}', '\u{ecd}', GC_Extend), ('\u{f18}', '\u{f19}', GC_Extend), - ('\u{f35}', '\u{f35}', GC_Extend), ('\u{f37}', '\u{f37}', GC_Extend), ('\u{f39}', '\u{f39}', - GC_Extend), ('\u{f3e}', '\u{f3f}', GC_SpacingMark), ('\u{f71}', '\u{f7e}', GC_Extend), - ('\u{f7f}', '\u{f7f}', GC_SpacingMark), ('\u{f80}', '\u{f84}', GC_Extend), ('\u{f86}', - '\u{f87}', GC_Extend), ('\u{f8d}', '\u{f97}', GC_Extend), ('\u{f99}', '\u{fbc}', GC_Extend), + ('\u{0}', '\u{9}', GC_Control), ('\u{a}', '\u{a}', GC_LF), ('\u{b}', '\u{c}', GC_Control), + ('\u{d}', '\u{d}', GC_CR), ('\u{e}', '\u{1f}', GC_Control), ('\u{7f}', '\u{9f}', + GC_Control), ('\u{ad}', '\u{ad}', GC_Control), ('\u{300}', '\u{36f}', GC_Extend), + ('\u{483}', '\u{489}', GC_Extend), ('\u{591}', '\u{5bd}', GC_Extend), ('\u{5bf}', '\u{5bf}', + GC_Extend), ('\u{5c1}', '\u{5c2}', GC_Extend), ('\u{5c4}', '\u{5c5}', GC_Extend), + ('\u{5c7}', '\u{5c7}', GC_Extend), ('\u{600}', '\u{605}', GC_Prepend), ('\u{610}', + '\u{61a}', GC_Extend), ('\u{61c}', '\u{61c}', GC_Control), ('\u{64b}', '\u{65f}', + GC_Extend), ('\u{670}', '\u{670}', GC_Extend), ('\u{6d6}', '\u{6dc}', GC_Extend), + ('\u{6dd}', '\u{6dd}', GC_Prepend), ('\u{6df}', '\u{6e4}', GC_Extend), ('\u{6e7}', + '\u{6e8}', GC_Extend), ('\u{6ea}', '\u{6ed}', GC_Extend), ('\u{70f}', '\u{70f}', + GC_Prepend), ('\u{711}', '\u{711}', GC_Extend), ('\u{730}', '\u{74a}', GC_Extend), + ('\u{7a6}', '\u{7b0}', GC_Extend), ('\u{7eb}', '\u{7f3}', GC_Extend), ('\u{816}', '\u{819}', + GC_Extend), ('\u{81b}', '\u{823}', GC_Extend), ('\u{825}', '\u{827}', GC_Extend), + ('\u{829}', '\u{82d}', GC_Extend), ('\u{859}', '\u{85b}', GC_Extend), ('\u{8d4}', '\u{8e1}', + GC_Extend), ('\u{8e2}', '\u{8e2}', GC_Prepend), ('\u{8e3}', '\u{902}', GC_Extend), + ('\u{903}', '\u{903}', GC_SpacingMark), ('\u{93a}', '\u{93a}', GC_Extend), ('\u{93b}', + '\u{93b}', GC_SpacingMark), ('\u{93c}', '\u{93c}', GC_Extend), ('\u{93e}', '\u{940}', + GC_SpacingMark), ('\u{941}', '\u{948}', GC_Extend), ('\u{949}', '\u{94c}', GC_SpacingMark), + ('\u{94d}', '\u{94d}', GC_Extend), ('\u{94e}', '\u{94f}', GC_SpacingMark), ('\u{951}', + '\u{957}', GC_Extend), ('\u{962}', '\u{963}', GC_Extend), ('\u{981}', '\u{981}', GC_Extend), + ('\u{982}', '\u{983}', GC_SpacingMark), ('\u{9bc}', '\u{9bc}', GC_Extend), ('\u{9be}', + '\u{9be}', GC_Extend), ('\u{9bf}', '\u{9c0}', GC_SpacingMark), ('\u{9c1}', '\u{9c4}', + GC_Extend), ('\u{9c7}', '\u{9c8}', GC_SpacingMark), ('\u{9cb}', '\u{9cc}', GC_SpacingMark), + ('\u{9cd}', '\u{9cd}', GC_Extend), ('\u{9d7}', '\u{9d7}', GC_Extend), ('\u{9e2}', '\u{9e3}', + GC_Extend), ('\u{a01}', '\u{a02}', GC_Extend), ('\u{a03}', '\u{a03}', GC_SpacingMark), + ('\u{a3c}', '\u{a3c}', GC_Extend), ('\u{a3e}', '\u{a40}', GC_SpacingMark), ('\u{a41}', + '\u{a42}', GC_Extend), ('\u{a47}', '\u{a48}', GC_Extend), ('\u{a4b}', '\u{a4d}', GC_Extend), + ('\u{a51}', '\u{a51}', GC_Extend), ('\u{a70}', '\u{a71}', GC_Extend), ('\u{a75}', '\u{a75}', + GC_Extend), ('\u{a81}', '\u{a82}', GC_Extend), ('\u{a83}', '\u{a83}', GC_SpacingMark), + ('\u{abc}', '\u{abc}', GC_Extend), ('\u{abe}', '\u{ac0}', GC_SpacingMark), ('\u{ac1}', + '\u{ac5}', GC_Extend), ('\u{ac7}', '\u{ac8}', GC_Extend), ('\u{ac9}', '\u{ac9}', + GC_SpacingMark), ('\u{acb}', '\u{acc}', GC_SpacingMark), ('\u{acd}', '\u{acd}', GC_Extend), + ('\u{ae2}', '\u{ae3}', GC_Extend), ('\u{b01}', '\u{b01}', GC_Extend), ('\u{b02}', '\u{b03}', + GC_SpacingMark), ('\u{b3c}', '\u{b3c}', GC_Extend), ('\u{b3e}', '\u{b3f}', GC_Extend), + ('\u{b40}', '\u{b40}', GC_SpacingMark), ('\u{b41}', '\u{b44}', GC_Extend), ('\u{b47}', + '\u{b48}', GC_SpacingMark), ('\u{b4b}', '\u{b4c}', GC_SpacingMark), ('\u{b4d}', '\u{b4d}', + GC_Extend), ('\u{b56}', '\u{b57}', GC_Extend), ('\u{b62}', '\u{b63}', GC_Extend), + ('\u{b82}', '\u{b82}', GC_Extend), ('\u{bbe}', '\u{bbe}', GC_Extend), ('\u{bbf}', '\u{bbf}', + GC_SpacingMark), ('\u{bc0}', '\u{bc0}', GC_Extend), ('\u{bc1}', '\u{bc2}', GC_SpacingMark), + ('\u{bc6}', '\u{bc8}', GC_SpacingMark), ('\u{bca}', '\u{bcc}', GC_SpacingMark), ('\u{bcd}', + '\u{bcd}', GC_Extend), ('\u{bd7}', '\u{bd7}', GC_Extend), ('\u{c00}', '\u{c00}', GC_Extend), + ('\u{c01}', '\u{c03}', GC_SpacingMark), ('\u{c3e}', '\u{c40}', GC_Extend), ('\u{c41}', + '\u{c44}', GC_SpacingMark), ('\u{c46}', '\u{c48}', GC_Extend), ('\u{c4a}', '\u{c4d}', + GC_Extend), ('\u{c55}', '\u{c56}', GC_Extend), ('\u{c62}', '\u{c63}', GC_Extend), + ('\u{c81}', '\u{c81}', GC_Extend), ('\u{c82}', '\u{c83}', GC_SpacingMark), ('\u{cbc}', + '\u{cbc}', GC_Extend), ('\u{cbe}', '\u{cbe}', GC_SpacingMark), ('\u{cbf}', '\u{cbf}', + GC_Extend), ('\u{cc0}', '\u{cc1}', GC_SpacingMark), ('\u{cc2}', '\u{cc2}', GC_Extend), + ('\u{cc3}', '\u{cc4}', GC_SpacingMark), ('\u{cc6}', '\u{cc6}', GC_Extend), ('\u{cc7}', + '\u{cc8}', GC_SpacingMark), ('\u{cca}', '\u{ccb}', GC_SpacingMark), ('\u{ccc}', '\u{ccd}', + GC_Extend), ('\u{cd5}', '\u{cd6}', GC_Extend), ('\u{ce2}', '\u{ce3}', GC_Extend), + ('\u{d01}', '\u{d01}', GC_Extend), ('\u{d02}', '\u{d03}', GC_SpacingMark), ('\u{d3e}', + '\u{d3e}', GC_Extend), ('\u{d3f}', '\u{d40}', GC_SpacingMark), ('\u{d41}', '\u{d44}', + GC_Extend), ('\u{d46}', '\u{d48}', GC_SpacingMark), ('\u{d4a}', '\u{d4c}', GC_SpacingMark), + ('\u{d4d}', '\u{d4d}', GC_Extend), ('\u{d4e}', '\u{d4e}', GC_Prepend), ('\u{d57}', + '\u{d57}', GC_Extend), ('\u{d62}', '\u{d63}', GC_Extend), ('\u{d82}', '\u{d83}', + GC_SpacingMark), ('\u{dca}', '\u{dca}', GC_Extend), ('\u{dcf}', '\u{dcf}', GC_Extend), + ('\u{dd0}', '\u{dd1}', GC_SpacingMark), ('\u{dd2}', '\u{dd4}', GC_Extend), ('\u{dd6}', + '\u{dd6}', GC_Extend), ('\u{dd8}', '\u{dde}', GC_SpacingMark), ('\u{ddf}', '\u{ddf}', + GC_Extend), ('\u{df2}', '\u{df3}', GC_SpacingMark), ('\u{e31}', '\u{e31}', GC_Extend), + ('\u{e33}', '\u{e33}', GC_SpacingMark), ('\u{e34}', '\u{e3a}', GC_Extend), ('\u{e47}', + '\u{e4e}', GC_Extend), ('\u{eb1}', '\u{eb1}', GC_Extend), ('\u{eb3}', '\u{eb3}', + GC_SpacingMark), ('\u{eb4}', '\u{eb9}', GC_Extend), ('\u{ebb}', '\u{ebc}', GC_Extend), + ('\u{ec8}', '\u{ecd}', GC_Extend), ('\u{f18}', '\u{f19}', GC_Extend), ('\u{f35}', '\u{f35}', + GC_Extend), ('\u{f37}', '\u{f37}', GC_Extend), ('\u{f39}', '\u{f39}', GC_Extend), + ('\u{f3e}', '\u{f3f}', GC_SpacingMark), ('\u{f71}', '\u{f7e}', GC_Extend), ('\u{f7f}', + '\u{f7f}', GC_SpacingMark), ('\u{f80}', '\u{f84}', GC_Extend), ('\u{f86}', '\u{f87}', + GC_Extend), ('\u{f8d}', '\u{f97}', GC_Extend), ('\u{f99}', '\u{fbc}', GC_Extend), ('\u{fc6}', '\u{fc6}', GC_Extend), ('\u{102d}', '\u{1030}', GC_Extend), ('\u{1031}', '\u{1031}', GC_SpacingMark), ('\u{1032}', '\u{1037}', GC_Extend), ('\u{1039}', '\u{103a}', GC_Extend), ('\u{103b}', '\u{103c}', GC_SpacingMark), ('\u{103d}', '\u{103e}', GC_Extend), @@ -868,7 +872,7 @@ pub mod word { pub use self::WordCat::*; #[allow(non_camel_case_types)] - #[derive(Clone, Copy, PartialEq, Eq, Debug)] + #[derive(Clone, Copy, PartialEq, Eq)] pub enum WordCat { WC_ALetter, WC_Any, diff --git a/src/test.rs b/src/test.rs index 3c43574..54493fe 100644 --- a/src/test.rs +++ b/src/test.rs @@ -34,6 +34,10 @@ fn test_graphemes() { // family emoji (more than two emoji joined by ZWJ) ("\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}", &["\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}"]), + // cartwheel emoji followed by two fitzpatrick skin tone modifiers + // (test case from issue #19) + ("\u{1F938}\u{1F3FE}\u{1F3FE}", + &["\u{1F938}\u{1F3FE}", "\u{1F3FE}"]), ]; for &(s, g) in TEST_SAME.iter().chain(EXTRA_SAME) {