Skip to content

Commit

Permalink
Use Unicode line breaking algorithm to find words
Browse files Browse the repository at this point in the history
This adds a new optional dependency on the unicode-linebreak crate,
which implements the line breaking algorithm from [Unicode Standard
Annex #14](https://www.unicode.org/reports/tr14/). We can use it to
find word boundaries in non-ASCII text.

The new dependency is enabled by default since these line breaks are
more correct than what you get by splitting on ASCII space.

This should help address #220 and #80, though I’m no expert on
non-Western languages, so more feedback from the community is
needed here.
  • Loading branch information
mgeisler committed May 2, 2021
1 parent 48b9480 commit ecbbde4
Show file tree
Hide file tree
Showing 9 changed files with 332 additions and 41 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,12 @@ harness = false
path = "benches/linear.rs"

[features]
default = ["unicode-width", "smawk"]
default = ["unicode-linebreak", "unicode-width", "smawk"]

[dependencies]
smawk = { version = "0.3", optional = true }
terminal_size = { version = "0.1", optional = true }
unicode-linebreak = { version = "0.1", optional = true }
unicode-width = { version= "0.1", optional = true }

[dependencies.hyphenation]
Expand Down
22 changes: 19 additions & 3 deletions benches/linear.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,25 @@ pub fn benchmark(c: &mut Criterion) {

#[cfg(feature = "smawk")]
{
#[cfg(feature = "unicode-linebreak")]
{
let options = textwrap::Options::new(LINE_LENGTH)
.wrap_algorithm(textwrap::core::WrapAlgorithm::OptimalFit)
.word_separator(textwrap::UnicodeBreakProperties);
group.bench_with_input(
BenchmarkId::new("fill_optimal_fit_unicode", length),
&text,
|b, text| {
b.iter(|| textwrap::fill(text, &options));
},
);
}

let options = textwrap::Options::new(LINE_LENGTH)
.wrap_algorithm(textwrap::core::WrapAlgorithm::OptimalFit);
.wrap_algorithm(textwrap::core::WrapAlgorithm::OptimalFit)
.word_separator(textwrap::AsciiSpace);
group.bench_with_input(
BenchmarkId::new("fill_optimal_fit", length),
BenchmarkId::new("fill_optimal_fit_ascii", length),
&text,
|b, text| {
b.iter(|| textwrap::fill(text, &options));
Expand All @@ -38,7 +53,8 @@ pub fn benchmark(c: &mut Criterion) {
}

let options = textwrap::Options::new(LINE_LENGTH)
.wrap_algorithm(textwrap::core::WrapAlgorithm::FirstFit);
.wrap_algorithm(textwrap::core::WrapAlgorithm::FirstFit)
.word_separator(textwrap::AsciiSpace);
group.bench_with_input(
BenchmarkId::new("fill_first_fit", length),
&text,
Expand Down
9 changes: 5 additions & 4 deletions examples/interactive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ mod unix_only {
use termion::{color, cursor, style};
#[cfg(feature = "smawk")]
use textwrap::core::WrapAlgorithm::{FirstFit, OptimalFit};
use textwrap::{wrap, AsciiSpace, Options};
use textwrap::{wrap, AsciiSpace, Options, WordSeparator};
use textwrap::{HyphenSplitter, NoHyphenation, WordSplitter};

#[cfg(feature = "hyphenation")]
Expand Down Expand Up @@ -57,7 +57,7 @@ mod unix_only {

fn draw_text<'a>(
text: &str,
options: &Options<'a, AsciiSpace, Box<dyn WordSplitter>>,
options: &Options<'a, Box<dyn WordSeparator>, Box<dyn WordSplitter>>,
splitter_label: &str,
stdout: &mut RawTerminal<io::Stdout>,
) -> Result<(), io::Error> {
Expand Down Expand Up @@ -257,8 +257,9 @@ mod unix_only {
}

let mut label = labels.pop().unwrap();
let mut options =
Options::new(35).splitter(Box::new(HyphenSplitter) as Box<dyn WordSplitter>);
let mut options = Options::new(35)
.splitter(Box::new(HyphenSplitter) as Box<dyn WordSplitter>)
.word_separator(Box::new(AsciiSpace) as Box<dyn WordSeparator>);
options.break_words = false;
options.splitter = splitters.pop().unwrap();

Expand Down
44 changes: 44 additions & 0 deletions examples/wasm/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion examples/wasm/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use wasm_bindgen::prelude::*;
use wasm_bindgen::JsCast;

use textwrap::core;
use textwrap::{core, WordSeparator};

#[wasm_bindgen]
extern "C" {
Expand Down
6 changes: 3 additions & 3 deletions src/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ impl std::ops::Deref for Word<'_> {
}

impl<'a> Word<'a> {
/// Construct a new `Word`.
/// Construct a `Word` from a string.
///
/// A trailing stretch of `' '` is automatically taken to be the
/// whitespace part of the word.
Expand Down Expand Up @@ -354,9 +354,9 @@ impl Fragment for Word<'_> {
/// vec![Word::from("foo-bar")]
/// );
/// ```
pub fn split_words<'a, I, T, S>(
pub fn split_words<'a, I, R, S>(
words: I,
options: &'a Options<'a, T, S>,
options: &'a Options<'a, R, S>,
) -> impl Iterator<Item = Word<'a>>
where
I: IntoIterator<Item = Word<'a>>,
Expand Down
Loading

0 comments on commit ecbbde4

Please sign in to comment.