Skip to content

Commit

Permalink
Find line breaks according to the Unicode line breaking algorithm
Browse files Browse the repository at this point in the history
This adds a new optional dependency on the unicode-linebreak crate,
which implements the line breaking algorithm from [Unicode Standard
Annex #14](https://www.unicode.org/reports/tr14/).

The new dependency is enabled by default since these line breaks are
more correct than what you get by splitting on whitespace.

This should help address #220 and #80, though I’m no expert on
non-Western languages. More feedback from the community would be
needed here.
  • Loading branch information
mgeisler committed Apr 8, 2021
1 parent 3b77e9c commit 33ac40c
Show file tree
Hide file tree
Showing 3 changed files with 302 additions and 38 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,13 @@ harness = false
path = "benches/linear.rs"

[features]
default = ["unicode-width", "smawk"]
default = ["unicode-linebreak", "unicode-width", "smawk"]

[dependencies]
hyphenation = { version = "0.8", optional = true, features = ["embed_en-us"] }
smawk = { version = "0.3", optional = true }
terminal_size = { version = "0.1", optional = true }
unicode-linebreak = { version = "0.1", optional = true }
unicode-width = { version= "0.1", optional = true }

[dev-dependencies]
Expand Down
248 changes: 228 additions & 20 deletions src/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,13 @@ const CSI: (char, char) = ('\x1b', '[');
/// The final bytes of an ANSI escape sequence must be in this range.
const ANSI_FINAL_BYTE: std::ops::RangeInclusive<char> = '\x40'..='\x7e';

/// Soft hyphen, also known as a “shy hyphen”. Should show up as ‘-’
/// if a line is broken at this point, and otherwise be invisible.
/// Textwrap does not currently support breaking words at soft
/// hyphens.
#[cfg(feature = "unicode-linebreak")]
const SHY: char = '\u{00ad}';

/// Skip ANSI escape sequences. The `ch` is the current `char`, the
/// `chars` provide the following characters. The `chars` will be
/// modified if `ch` is the start of an ANSI escape sequence.
Expand Down Expand Up @@ -237,6 +244,16 @@ impl std::ops::Deref for Word<'_> {

impl<'a> Word<'a> {
/// Construct a new `Word`.
pub fn new(word: &'a str, width: usize, whitespace: &'a str, penalty: &'a str) -> Self {
Word {
word,
width,
whitespace,
penalty,
}
}

/// Construct a `Word` from a string.
///
/// A trailing stretch of `' '` is automatically taken to be the
/// whitespace part of the word.
Expand Down Expand Up @@ -326,19 +343,82 @@ impl Fragment for Word<'_> {
}
}

/// Split line into words separated by regions of `' '` characters.
/// Split `line` into words according to the `line_break_algorithm`.
///
/// The simplest way to find words is simply to split the text on
/// regions of space characters (`' '`). This works for Western
/// languages without emojis and similar non-ASCII characters. A more
/// complex way of finding words is to use the Unicode line breaking
/// algorithm described in the [Unicode Standard Annex
/// #14](https://www.unicode.org/reports/tr14/). This algorithm finds
/// break points in non-ASCII text.
///
/// # Examples
///
/// Finding words by whitespace:
///
/// ```
/// use textwrap::core::{find_words, Fragment, Word};
/// let words = find_words("Hello World!").collect::<Vec<_>>();
/// use textwrap::core::{find_words, Fragment, LineBreakAlgorithm, Word};
///
/// let words = find_words("Hello World!", LineBreakAlgorithm::Whitespace);
/// assert_eq!(words, vec![Word::from("Hello "), Word::from("World!")]);
/// assert_eq!(words[0].width(), 5);
/// assert_eq!(words[0].whitespace_width(), 1);
/// assert_eq!(words[0].penalty_width(), 0);
/// ```
pub fn find_words(line: &str) -> impl Iterator<Item = Word> {
///
/// Using the Unicode line breaking algorithm, we can find line break
/// opportunities between characters with no interspaced whitespace:
///
/// ```
/// use textwrap::core::{find_words, LineBreakAlgorithm, Word};
///
/// #[cfg(feature = "unicode-linebreak")]
/// assert_eq!(find_words("Emojis: 😂😍", LineBreakAlgorithm::UnicodeLineBreaks),
/// vec![Word::from("Emojis: "),
/// Word::from("😂"),
/// Word::from("😍")]);
///
/// #[cfg(feature = "unicode-linebreak")]
/// assert_eq!(find_words("CJK: 你好", LineBreakAlgorithm::UnicodeLineBreaks),
/// vec![Word::from("CJK: "),
/// Word::from("你"),
/// Word::from("好")]);
/// ```
///
/// A U+2060 Word Joiner character can be inserted if you want to
/// manually override the defaults and keep the characters together:
///
/// ```
/// use textwrap::core::{find_words, LineBreakAlgorithm, Word};
///
/// #[cfg(feature = "unicode-linebreak")]
/// assert_eq!(find_words("Emojis: 😂\u{2060}😍", LineBreakAlgorithm::UnicodeLineBreaks),
/// vec![Word::from("Emojis: "),
/// Word::from("😂\u{2060}😍")]);
/// ```
///
/// The Unicode line breaking algorithm will suppress line breaks
/// around certain punctuation characters:
///
/// ```
/// use textwrap::core::{find_words, LineBreakAlgorithm, Word};
///
/// #[cfg(feature = "unicode-linebreak")]
/// assert_eq!(find_words("[ foo ] bar !", LineBreakAlgorithm::UnicodeLineBreaks),
/// vec![Word::from("[ foo ] "),
/// Word::from("bar !")]);
/// ```
pub fn find_words(line: &str, line_break_algorithm: LineBreakAlgorithm) -> Vec<Word> {
    // Both helpers yield `Word`s lazily; the declared return type
    // tells `collect` to gather them into a `Vec`.
    match line_break_algorithm {
        LineBreakAlgorithm::Whitespace => find_ascii_words(line).collect(),
        #[cfg(feature = "unicode-linebreak")]
        LineBreakAlgorithm::UnicodeLineBreaks => find_unicode_words(line).collect(),
    }
}

/// Split `line` into words separated by regions of `' '` characters.
fn find_ascii_words(line: &str) -> impl Iterator<Item = Word> {
let mut start = 0;
let mut in_whitespace = false;
let mut char_indices = line.char_indices();
Expand Down Expand Up @@ -371,6 +451,86 @@ pub fn find_words(line: &str) -> impl Iterator<Item = Word> {
})
}

/// Split `line` into words using the Unicode line breaking algorithm.
///
/// Break opportunities are computed on a copy of `line` from which
/// all ANSI escape sequences have been removed. They are then mapped
/// back to byte offsets in `line`, so every returned `Word` is a
/// contiguous slice of the original string, escape sequences
/// included.
///
/// Break opportunities directly after a `'-'` or a soft hyphen are
/// suppressed.
#[cfg(feature = "unicode-linebreak")]
fn find_unicode_words(line: &str) -> impl Iterator<Item = Word> + '_ {
    // Strip all ANSI escape sequences from `text`.
    fn strip_ansi_escape_sequences(text: &str) -> String {
        let mut result = String::with_capacity(text.len());

        // `while let` rather than `for`: the iterator is also handed
        // to `skip_ansi_escape_sequence`, which may advance it.
        let mut chars = text.chars();
        while let Some(ch) = chars.next() {
            if skip_ansi_escape_sequence(ch, &mut chars) {
                continue;
            }
            result.push(ch);
        }

        result
    }

    // Construct an iterator over (original index, stripped index)
    // tuples. We find the Unicode linebreaks on a stripped string,
    // but we need the original indices so we can form words based on
    // the original string.
    let mut last_stripped_idx = 0;
    let mut char_indices = line.char_indices();
    let mut idx_map = std::iter::from_fn(move || {
        let (orig_idx, ch) = char_indices.next()?;
        let stripped_idx = last_stripped_idx;
        // An escape sequence contributes nothing to the stripped
        // string, so the stripped index only advances for other
        // characters.
        if !skip_ansi_escape_sequence(ch, &mut char_indices.by_ref().map(|(_, ch)| ch)) {
            last_stripped_idx += ch.len_utf8();
        }
        Some((orig_idx, stripped_idx))
    });

    let stripped = strip_ansi_escape_sequences(line);
    let mut opportunities = unicode_linebreak::linebreaks(&stripped)
        .filter(|(idx, _)| {
            // `idx` is a byte offset into `stripped`, so the character
            // before the break must be looked up there. Slicing `line`
            // with it would inspect the wrong character, or panic on
            // a non-char boundary, when `line` contains escape
            // sequences.
            //
            // Kept as a `match` (instead of `matches!`) so that each
            // suppressed case can carry its own explanation.
            #[allow(clippy::match_like_matches_macro)]
            match stripped[..*idx].chars().next_back() {
                // We suppress breaks at ‘-’ since we want to control
                // this via the WordSplitter.
                Some('-') => false,
                // Soft hyphens are currently not supported since we
                // require all `Word` fragments to be continuous in
                // the input string.
                Some(SHY) => false,
                // Other breaks should be fine!
                _ => true,
            }
        })
        .collect::<Vec<_>>()
        .into_iter();

    // Remove final break opportunity, we will add it below using
    // &line[start..]; This ensures that we correctly include a
    // trailing ANSI escape sequence.
    opportunities.next_back();

    let mut start = 0;
    std::iter::from_fn(move || {
        // Borrow the iterator with `by_ref` so the remaining
        // opportunities are still there on the next call.
        for (idx, _) in opportunities.by_ref() {
            // Translate the stripped offset back to an offset in
            // `line`; the first matching entry wins.
            if let Some((orig_idx, _)) = idx_map.find(|&(_, stripped_idx)| stripped_idx == idx) {
                let word = Word::from(&line[start..orig_idx]);
                start = orig_idx;
                return Some(word);
            }
        }

        // Whatever is left after the last break forms the final word.
        if start < line.len() {
            let word = Word::from(&line[start..]);
            start = line.len();
            return Some(word);
        }

        None
    })
}

/// Split words into smaller words according to the split points given
/// by `options`.
///
Expand Down Expand Up @@ -458,6 +618,27 @@ where
shortened_words
}

/// Strategy used to locate possible line breaks.
///
/// Words are found either after spaces and newlines (the
/// `Whitespace` variant) or by applying rules which take Unicode
/// character classes into account (the `UnicodeLineBreaks` variant).
///
/// See the [`find_words`] function for examples.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum LineBreakAlgorithm {
    /// Break lines at `'\n'` and `' '` characters.
    Whitespace,
    /// Break lines according to the Unicode line breaking algorithm.
    ///
    /// **Note:** Only available when the `unicode-linebreak` Cargo
    /// feature is enabled.
    #[cfg(feature = "unicode-linebreak")]
    UnicodeLineBreaks,
}

/// Wrapping algorithms.
///
/// After a text has been broken into [`Fragment`]s, one now has
Expand Down Expand Up @@ -510,7 +691,7 @@ pub enum WrapAlgorithm {
/// a large gap:
///
/// ```
/// use textwrap::core::{find_words, wrap_first_fit, Word};
/// use textwrap::core::{find_words, wrap_first_fit, LineBreakAlgorithm, Word};
///
/// // Helper to convert wrapped lines to a Vec<String>.
/// fn lines_to_strings(lines: Vec<&[Word<'_>]>) -> Vec<String> {
Expand All @@ -520,7 +701,7 @@ pub enum WrapAlgorithm {
/// }
///
/// let text = "These few words will unfortunately not wrap nicely.";
/// let words = find_words(text).collect::<Vec<_>>();
/// let words = find_words(text, LineBreakAlgorithm::Whitespace);
/// assert_eq!(lines_to_strings(wrap_first_fit(&words, |_| 15)),
/// vec!["These few words",
/// "will", // <-- short line
Expand Down Expand Up @@ -750,81 +931,108 @@ mod tests {

#[test]
fn find_words_empty() {
assert_iter_eq!(find_words(""), vec![]);
assert_iter_eq!(find_ascii_words(""), vec![]);
}

#[test]
fn find_words_single_word() {
assert_iter_eq!(find_words("foo"), vec![Word::from("foo")]);
assert_iter_eq!(find_ascii_words("foo"), vec![Word::from("foo")]);
}

#[test]
fn find_words_two_words() {
assert_iter_eq!(
find_words("foo bar"),
find_ascii_words("foo bar"),
vec![Word::from("foo "), Word::from("bar")]
);
}

#[test]
fn find_words_multiple_words() {
assert_iter_eq!(
find_words("foo bar baz"),
find_ascii_words("foo bar baz"),
vec![Word::from("foo "), Word::from("bar "), Word::from("baz")]
);
}

#[test]
fn find_words_whitespace() {
assert_iter_eq!(find_words(" "), vec![Word::from(" ")]);
fn find_words_only_whitespace() {
assert_iter_eq!(find_ascii_words(" "), vec![Word::from(" ")]);
}

#[test]
fn find_words_inter_word_whitespace() {
assert_iter_eq!(
find_words("foo bar"),
find_ascii_words("foo bar"),
vec![Word::from("foo "), Word::from("bar")]
)
}

#[test]
fn find_words_trailing_whitespace() {
assert_iter_eq!(find_words("foo "), vec![Word::from("foo ")]);
assert_iter_eq!(find_ascii_words("foo "), vec![Word::from("foo ")]);
}

#[test]
fn find_words_leading_whitespace() {
assert_iter_eq!(
find_words(" foo"),
find_ascii_words(" foo"),
vec![Word::from(" "), Word::from("foo")]
);
}

#[test]
fn find_words_multi_column_char() {
assert_iter_eq!(
find_words("\u{1f920}"), // cowboy emoji 🤠
find_ascii_words("\u{1f920}"), // cowboy emoji 🤠
vec![Word::from("\u{1f920}")]
);
}

#[test]
fn find_words_hyphens() {
assert_iter_eq!(find_words("foo-bar"), vec![Word::from("foo-bar")]);
assert_iter_eq!(find_ascii_words("foo-bar"), vec![Word::from("foo-bar")]);
assert_iter_eq!(
find_words("foo- bar"),
find_ascii_words("foo- bar"),
vec![Word::from("foo- "), Word::from("bar")]
);
assert_iter_eq!(
find_words("foo - bar"),
find_ascii_words("foo - bar"),
vec![Word::from("foo "), Word::from("- "), Word::from("bar")]
);
assert_iter_eq!(
find_words("foo -bar"),
find_ascii_words("foo -bar"),
vec![Word::from("foo "), Word::from("-bar")]
);
}

#[test]
fn find_words_colored_text() {
    use termion::color::{Blue, Fg, Green, Reset};

    // Each word is wrapped in its own color-on / color-reset pair.
    let green_hello = format!("{}Hello{} ", Fg(Green), Fg(Reset));
    let blue_world = format!("{}World!{}", Fg(Blue), Fg(Reset));
    let text = format!("{}{}", green_hello, blue_world);

    assert_iter_eq!(
        find_ascii_words(&text),
        vec![Word::from(&green_hello), Word::from(&blue_world)]
    );

    #[cfg(feature = "unicode-linebreak")]
    assert_iter_eq!(
        find_unicode_words(&text),
        vec![Word::from(&green_hello), Word::from(&blue_world)]
    );
}

#[test]
fn find_words_color_inside_word() {
    // Escape sequences in the middle of a word must not split it.
    let text = "foo\u{1b}[0m\u{1b}[32mbar\u{1b}[0mbaz";

    assert_iter_eq!(find_ascii_words(text), vec![Word::from(text)]);

    #[cfg(feature = "unicode-linebreak")]
    assert_iter_eq!(find_unicode_words(text), vec![Word::from(text)]);
}

#[test]
fn split_words_no_words() {
assert_iter_eq!(split_words(vec![], 80), vec![]);
Expand Down
Loading

0 comments on commit 33ac40c

Please sign in to comment.