Find line breaks according to the Unicode line breaking algorithm

This adds a new optional dependency on the unicode-linebreak crate, which implements the line breaking algorithm from [Unicode Standard Annex #14](https://www.unicode.org/reports/tr14/). The new dependency is enabled by default since these line breaks are more correct than what you get by splitting on whitespace. This should help address #220 and #80, though I’m no expert on non-Western languages. More feedback from the community would be needed here.
mgeisler · Apr 8, 2021 · 33ac40c · 33ac40c
1 parent 3b77e9c
commit 33ac40c
Show file tree

Hide file tree

Showing 3 changed files with 302 additions and 38 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -21,12 +21,13 @@ harness = false
 path = "benches/linear.rs"
 
 [features]
-default = ["unicode-width", "smawk"]
+default = ["unicode-linebreak", "unicode-width", "smawk"]
 
 [dependencies]
 hyphenation = { version = "0.8", optional = true, features = ["embed_en-us"] }
 smawk = { version = "0.3", optional = true }
 terminal_size = { version = "0.1", optional = true }
+unicode-linebreak = { version = "0.1", optional = true }
 unicode-width = { version= "0.1", optional = true }
 
 [dev-dependencies]

diff --git a/src/core.rs b/src/core.rs
@@ -44,6 +44,13 @@ const CSI: (char, char) = ('\x1b', '[');
 /// The final bytes of an ANSI escape sequence must be in this range.
 const ANSI_FINAL_BYTE: std::ops::RangeInclusive<char> = '\x40'..='\x7e';
 
+/// Soft hyphen, also known as a “shy hyphen”. Should show up as ‘-’
+/// if a line is broken at this point, and otherwise be invisible.
+/// Textwrap does not currently support breaking words at soft
+/// hyphens.
+#[cfg(feature = "unicode-linebreak")]
+const SHY: char = '\u{00ad}';
+
 /// Skip ANSI escape sequences. The `ch` is the current `char`, the
 /// `chars` provide the following characters. The `chars` will be
 /// modified if `ch` is the start of an ANSI escape sequence.
@@ -237,6 +244,16 @@ impl std::ops::Deref for Word<'_> {
 
 impl<'a> Word<'a> {
     /// Construct a new `Word`.
+    pub fn new(word: &'a str, width: usize, whitespace: &'a str, penalty: &'a str) -> Self {
+        // Every part is supplied by the caller and stored unchanged;
+        // in particular, `width` is not derived from `word` here.
+        Self {
+            word,
+            width,
+            whitespace,
+            penalty,
+        }
+    }
+
+    /// Construct a `Word` from a string.
     ///
     /// A trailing stretch of `' '` is automatically taken to be the
     /// whitespace part of the word.
@@ -326,19 +343,82 @@ impl Fragment for Word<'_> {
     }
 }
 
-/// Split line into words separated by regions of `' '` characters.
+/// Split `line` into words according to the `line_break_algorithm`.
+///
+/// The simplest way to find words is to split the text on regions
+/// of space characters (`' '`). This works for Western languages
+/// without emojis and similar non-ASCII characters. A more
+/// complex way of finding words is to use the Unicode line breaking
+/// algorithm described in the [Unicode Standard Annex
+/// #14](https://www.unicode.org/reports/tr14/). This algorithm finds
+/// break points in non-ASCII text.
 ///
 /// # Examples
 ///
+/// Finding words by whitespace:
+///
 /// ```
-/// use textwrap::core::{find_words, Fragment, Word};
-/// let words = find_words("Hello World!").collect::<Vec<_>>();
+/// use textwrap::core::{find_words, Fragment, LineBreakAlgorithm, Word};
+///
+/// let words = find_words("Hello World!", LineBreakAlgorithm::Whitespace);
 /// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]);
 /// assert_eq!(words[0].width(), 5);
 /// assert_eq!(words[0].whitespace_width(), 1);
 /// assert_eq!(words[0].penalty_width(), 0);
 /// ```
-pub fn find_words(line: &str) -> impl Iterator<Item = Word> {
+///
+/// Using the Unicode line breaking algorithm, we can find line break
+/// opportunities between characters with no interspaced whitespace:
+///
+/// ```
+/// use textwrap::core::{find_words, LineBreakAlgorithm, Word};
+///
+/// #[cfg(feature = "unicode-linebreak")]
+/// assert_eq!(find_words("Emojis: 😂😍", LineBreakAlgorithm::UnicodeLineBreaks),
+///            vec![Word::from("Emojis: "),
+///                 Word::from("😂"),
+///                 Word::from("😍")]);
+///
+/// #[cfg(feature = "unicode-linebreak")]
+/// assert_eq!(find_words("CJK: 你好", LineBreakAlgorithm::UnicodeLineBreaks),
+///            vec![Word::from("CJK: "),
+///                 Word::from("你"),
+///                 Word::from("好")]);
+/// ```
+///
+/// A U+2060 Word Joiner character can be inserted if you want to
+/// manually override the defaults and keep the characters together:
+///
+/// ```
+/// use textwrap::core::{find_words, LineBreakAlgorithm, Word};
+///
+/// #[cfg(feature = "unicode-linebreak")]
+/// assert_eq!(find_words("Emojis: 😂\u{2060}😍", LineBreakAlgorithm::UnicodeLineBreaks),
+///            vec![Word::from("Emojis: "),
+///                 Word::from("😂\u{2060}😍")]);
+/// ```
+///
+/// The Unicode line breaking algorithm will suppress line breaks
+/// around certain punctuation characters:
+///
+/// ```
+/// use textwrap::core::{find_words, LineBreakAlgorithm, Word};
+///
+/// #[cfg(feature = "unicode-linebreak")]
+/// assert_eq!(find_words("[ foo ] bar !", LineBreakAlgorithm::UnicodeLineBreaks),
+///            vec![Word::from("[ foo ] "),
+///                 Word::from("bar !")]);
+/// ```
+pub fn find_words(line: &str, line_break_algorithm: LineBreakAlgorithm) -> Vec<Word> {
+    // Both strategies produce lazy iterators of different types; they
+    // are collected eagerly so that every arm yields a `Vec<Word>`.
+    match line_break_algorithm {
+        LineBreakAlgorithm::Whitespace => find_ascii_words(line).collect(),
+        #[cfg(feature = "unicode-linebreak")]
+        LineBreakAlgorithm::UnicodeLineBreaks => find_unicode_words(line).collect(),
+    }
+}
+
+/// Split `line` into words separated by regions of `' '` characters.
+fn find_ascii_words(line: &str) -> impl Iterator<Item = Word> {
     let mut start = 0;
     let mut in_whitespace = false;
     let mut char_indices = line.char_indices();
@@ -371,6 +451,86 @@ pub fn find_words(line: &str) -> impl Iterator<Item = Word> {
     })
 }
 
+/// Split `line` into words using the Unicode line breaking algorithm.
+///
+/// Break opportunities are computed on a copy of `line` from which
+/// ANSI escape sequences have been stripped. They are then mapped
+/// back to byte offsets in `line`, so the returned words are slices
+/// of the original text, escape sequences included. Breaks directly
+/// after `'-'` and after a soft hyphen are suppressed.
+#[cfg(feature = "unicode-linebreak")]
+fn find_unicode_words(line: &str) -> impl Iterator<Item = Word> + '_ {
+    // Strip all ANSI escape sequences from `text`.
+    fn strip_ansi_escape_sequences(text: &str) -> String {
+        let mut result = String::with_capacity(text.len());
+
+        let mut chars = text.chars();
+        while let Some(ch) = chars.next() {
+            if skip_ansi_escape_sequence(ch, &mut chars) {
+                continue;
+            }
+            result.push(ch);
+        }
+
+        result
+    }
+
+    // Construct an iterator over (original index, stripped index)
+    // tuples. We find the Unicode linebreaks on a stripped string,
+    // but we need the original indices so we can form words based on
+    // the original string.
+    //
+    // The start of an escape sequence is reported with the stripped
+    // index of the character that follows it, since the stripped
+    // index is only advanced for characters that are kept.
+    let mut last_stripped_idx = 0;
+    let mut char_indices = line.char_indices();
+    let mut idx_map = std::iter::from_fn(move || match char_indices.next() {
+        Some((orig_idx, ch)) => {
+            let stripped_idx = last_stripped_idx;
+            if !skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) {
+                last_stripped_idx += ch.len_utf8();
+            }
+            Some((orig_idx, stripped_idx))
+        }
+        None => None,
+    });
+
+    let stripped = strip_ansi_escape_sequences(line);
+    let mut opportunities = unicode_linebreak::linebreaks(&stripped)
+        .filter(|(idx, _)| {
+            // `idx` is a byte offset into `stripped`, so the character
+            // before the break must be looked up there. Slicing `line`
+            // with it would inspect the wrong character whenever an
+            // escape sequence precedes the break, and could panic by
+            // landing inside a multi-byte character.
+            //
+            // NOTE(review): the `match` is presumably kept instead of
+            // `matches!` for compatibility with older compilers --
+            // confirm before removing the `allow`.
+            #[allow(clippy::match_like_matches_macro)]
+            match stripped[..*idx].chars().next_back() {
+                // We suppress breaks at ‘-’ since we want to control
+                // this via the WordSplitter.
+                Some('-') => false,
+                // Soft hyphens are currently not supported since we
+                // require all `Word` fragments to be continuous in
+                // the input string.
+                Some(SHY) => false,
+                // Other breaks should be fine!
+                _ => true,
+            }
+        })
+        .collect::<Vec<_>>()
+        .into_iter();
+
+    // Remove final break opportunity, we will add it below using
+    // &line[start..]; This ensures that we correctly include a
+    // trailing ANSI escape sequence.
+    opportunities.next_back();
+
+    let mut start = 0;
+    std::iter::from_fn(move || {
+        // `while let` rather than `for`: the iterator is owned by this
+        // closure and must keep its position between calls.
+        #[allow(clippy::while_let_on_iterator)]
+        while let Some((idx, _)) = opportunities.next() {
+            // Advance the index map until it reaches the stripped
+            // offset of this break; the matching original offset is
+            // where the next word starts.
+            if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx) {
+                let word = Word::from(&line[start..orig_idx]);
+                start = orig_idx;
+                return Some(word);
+            }
+        }
+
+        // Emit whatever is left after the last break as the final word.
+        if start < line.len() {
+            let word = Word::from(&line[start..]);
+            start = line.len();
+            return Some(word);
+        }
+
+        None
+    })
+}
+
 /// Split words into smaller words according to the split points given
 /// by `options`.
 ///
@@ -458,6 +618,27 @@ where
     shortened_words
 }
 
+/// Line breaking algorithm.
+///
+/// This determines where words are found in lines: either after
+/// spaces and newlines (the `Whitespace` variant) or using rules
+/// which take Unicode character classes into account (the
+/// `UnicodeLineBreaks` variant).
+///
+/// See the [`find_words`] function for examples.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub enum LineBreakAlgorithm {
+    /// Allow lines to be broken at `'\n'` and `' '` characters.
+    Whitespace,
+    /// Allow lines to be broken according to the Unicode line
+    /// breaking algorithm, as described in [Unicode Standard Annex
+    /// #14](https://www.unicode.org/reports/tr14/).
+    ///
+    /// **Note:** Only available when the `unicode-linebreak` Cargo
+    /// feature is enabled.
+    #[cfg(feature = "unicode-linebreak")]
+    UnicodeLineBreaks,
+}
+
 /// Wrapping algorithms.
 ///
 /// After a text has been broken into [`Fragment`]s, the one now has
@@ -510,7 +691,7 @@ pub enum WrapAlgorithm {
 /// a large gap:
 ///
 /// ```
-/// use textwrap::core::{find_words, wrap_first_fit, Word};
+/// use textwrap::core::{find_words, wrap_first_fit, LineBreakAlgorithm, Word};
 ///
 /// // Helper to convert wrapped lines to a Vec<String>.
 /// fn lines_to_strings(lines: Vec<&[Word<'_>]>) -> Vec<String> {
@@ -520,7 +701,7 @@ pub enum WrapAlgorithm {
 /// }
 ///
 /// let text = "These few words will unfortunately not wrap nicely.";
-/// let words = find_words(text).collect::<Vec<_>>();
+/// let words = find_words(text, LineBreakAlgorithm::Whitespace);
 /// assert_eq!(lines_to_strings(wrap_first_fit(&words, |_| 15)),
 ///            vec!["These few words",
 ///                 "will",  // <-- short line
@@ -750,81 +931,108 @@ mod tests {
 
     #[test]
     fn find_words_empty() {
-        assert_iter_eq!(find_words(""), vec![]);
+        assert_iter_eq!(find_ascii_words(""), vec![]);
     }
 
     #[test]
     fn find_words_single_word() {
-        assert_iter_eq!(find_words("foo"), vec![Word::from("foo")]);
+        assert_iter_eq!(find_ascii_words("foo"), vec![Word::from("foo")]);
     }
 
     #[test]
     fn find_words_two_words() {
         assert_iter_eq!(
-            find_words("foo bar"),
+            find_ascii_words("foo bar"),
             vec![Word::from("foo "), Word::from("bar")]
         );
     }
 
     #[test]
     fn find_words_multiple_words() {
         assert_iter_eq!(
-            find_words("foo bar baz"),
+            find_ascii_words("foo bar baz"),
             vec![Word::from("foo "), Word::from("bar "), Word::from("baz")]
         );
     }
 
     #[test]
-    fn find_words_whitespace() {
-        assert_iter_eq!(find_words("    "), vec![Word::from("    ")]);
+    fn find_words_only_whitespace() {
+        assert_iter_eq!(find_ascii_words("    "), vec![Word::from("    ")]);
     }
 
     #[test]
     fn find_words_inter_word_whitespace() {
         assert_iter_eq!(
-            find_words("foo   bar"),
+            find_ascii_words("foo   bar"),
             vec![Word::from("foo   "), Word::from("bar")]
         )
     }
 
     #[test]
     fn find_words_trailing_whitespace() {
-        assert_iter_eq!(find_words("foo   "), vec![Word::from("foo   ")]);
+        assert_iter_eq!(find_ascii_words("foo   "), vec![Word::from("foo   ")]);
     }
 
     #[test]
     fn find_words_leading_whitespace() {
         assert_iter_eq!(
-            find_words("   foo"),
+            find_ascii_words("   foo"),
             vec![Word::from("   "), Word::from("foo")]
         );
     }
 
     #[test]
     fn find_words_multi_column_char() {
         assert_iter_eq!(
-            find_words("\u{1f920}"), // cowboy emoji 🤠
+            find_ascii_words("\u{1f920}"), // cowboy emoji 🤠
             vec![Word::from("\u{1f920}")]
         );
     }
 
     #[test]
     fn find_words_hyphens() {
-        assert_iter_eq!(find_words("foo-bar"), vec![Word::from("foo-bar")]);
+        assert_iter_eq!(find_ascii_words("foo-bar"), vec![Word::from("foo-bar")]);
         assert_iter_eq!(
-            find_words("foo- bar"),
+            find_ascii_words("foo- bar"),
             vec![Word::from("foo- "), Word::from("bar")]
         );
         assert_iter_eq!(
-            find_words("foo - bar"),
+            find_ascii_words("foo - bar"),
             vec![Word::from("foo "), Word::from("- "), Word::from("bar")]
         );
         assert_iter_eq!(
-            find_words("foo -bar"),
+            find_ascii_words("foo -bar"),
             vec![Word::from("foo "), Word::from("-bar")]
         );
     }
 
+    #[test]
+    fn find_words_colored_text() {
+        use termion::color::{Blue, Fg, Green, Reset};
+
+        // Two colored words: each one is wrapped in its own escape
+        // sequences, which are expected to stay attached to it.
+        let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset));
+        let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset));
+        let colored_text = format!("{}{}", green_hello, blue_world);
+
+        assert_iter_eq!(
+            find_ascii_words(&colored_text),
+            vec![Word::from(&green_hello), Word::from(&blue_world)]
+        );
+
+        #[cfg(feature = "unicode-linebreak")]
+        assert_iter_eq!(
+            find_unicode_words(&colored_text),
+            vec![Word::from(&green_hello), Word::from(&blue_world)]
+        );
+    }
+
+    #[test]
+    fn find_words_color_inside_word() {
+        // Escape sequences in the middle of a word are expected to
+        // leave it as a single word.
+        let colored_word = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz";
+        assert_iter_eq!(find_ascii_words(colored_word), vec![Word::from(colored_word)]);
+        #[cfg(feature = "unicode-linebreak")]
+        assert_iter_eq!(find_unicode_words(colored_word), vec![Word::from(colored_word)]);
+    }
+
     #[test]
     fn split_words_no_words() {
         assert_iter_eq!(split_words(vec![], 80), vec![]);