From f9086ddd3e256fa5eaab57f4145fdee88588a3dc Mon Sep 17 00:00:00 2001 From: Alex Povel Date: Mon, 29 May 2023 19:08:14 +0200 Subject: [PATCH] docs: Initial version of public API surface Also fixed `pedantic` `clippy` lints --- Cargo.lock | 4 + LICENSE | 7 + README.md | 4 + common/Cargo.toml | 5 + common/src/itertools.rs | 65 ++++ common/src/lib.rs | 95 +---- .../util/iteration.rs => common/src/lookup.rs | 62 +--- ...__itertools__tests__test_power_set-[].snap | 2 +- ...s__tests__test_power_set-[_1,_2,_3,_].snap | 2 +- ...ools__tests__test_power_set-[_1,_2,_].snap | 2 +- ...ertools__tests__test_power_set-[_1,_].snap | 2 +- ...ests__test_power_set_without_empty-[].snap | 2 +- ..._power_set_without_empty-[_1,_2,_3,_].snap | 2 +- ...est_power_set_without_empty-[_1,_2,_].snap | 2 +- ...__test_power_set_without_empty-[_1,_].snap | 2 +- common/src/strings.rs | 91 +++++ core/Cargo.toml | 14 + core/benches/lookup.rs | 2 +- core/benches/threading.rs | 9 +- core/build.rs | 2 +- core/src/lib.rs | 51 ++- core/src/main.rs | 8 +- core/src/stages/german/driver.rs | 338 ++++++++++++++++-- core/src/stages/german/machine.rs | 15 +- core/src/stages/german/mod.rs | 3 +- ...tests__test_word_casing_from_string-.snap} | 2 +- ...t_word_casing_from_string-_SCREAMING.snap} | 2 +- ..._test_word_casing_from_string-_WOW!!.snap} | 2 +- ..._test_word_casing_from_string-_bItTe.snap} | 2 +- ..._test_word_casing_from_string-_dANKE.snap} | 2 +- ..._test_word_casing_from_string-_hello.snap} | 2 +- ..._test_word_casing_from_string-_uebel.snap} | 2 +- ...ord_casing_from_string-_\303\237uper.snap" | 2 +- ...word_casing_from_string-_\303\274bel.snap" | 2 +- ...g_from_string-_\341\272\236uperduper.snap" | 2 +- ...ng_from_string-_\341\272\236\303\237.snap" | 2 +- ...casing_from_string-_\360\237\230\200.snap" | 2 +- core/src/stages/german/words.rs | 95 ++++- core/src/stages/mod.rs | 38 +- core/src/stages/symbols/mod.rs | 10 +- core/src/stages/tooling.rs | 65 ++++ core/src/util/mod.rs | 2 - core/src/util/strings.rs | 90 ----- 43 files changed, 759 insertions(+), 356 deletions(-) create mode 100644 LICENSE create mode 100644 README.md create mode 100644 common/src/itertools.rs rename core/src/util/iteration.rs => common/src/lookup.rs (71%) rename core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set-[].snap => common/src/snapshots/common__itertools__tests__test_power_set-[].snap (61%) rename core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set-[_1,_2,_3,_].snap => common/src/snapshots/common__itertools__tests__test_power_set-[_1,_2,_3,_].snap (80%) rename core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set-[_1,_2,_].snap => common/src/snapshots/common__itertools__tests__test_power_set-[_1,_2,_].snap (72%) rename core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set-[_1,_].snap => common/src/snapshots/common__itertools__tests__test_power_set-[_1,_].snap (65%) rename core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set_without_empty-[].snap => common/src/snapshots/common__itertools__tests__test_power_set_without_empty-[].snap (60%) rename core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set_without_empty-[_1,_2,_3,_].snap => common/src/snapshots/common__itertools__tests__test_power_set_without_empty-[_1,_2,_3,_].snap (80%) rename core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set_without_empty-[_1,_2,_].snap => common/src/snapshots/common__itertools__tests__test_power_set_without_empty-[_1,_2,_].snap (71%) rename core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set_without_empty-[_1,_].snap => common/src/snapshots/common__itertools__tests__test_power_set_without_empty-[_1,_].snap (63%) create mode 100644 common/src/strings.rs rename core/src/{util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-.snap => stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-.snap} (70%) rename core/src/{util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_SCREAMING.snap => stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_SCREAMING.snap} (71%) rename core/src/{util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_WOW!!.snap => stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_WOW!!.snap} (77%) rename core/src/{util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_bItTe.snap => stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_bItTe.snap} (68%) rename core/src/{util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_dANKE.snap => stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_dANKE.snap} (68%) rename core/src/{util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_hello.snap => stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_hello.snap} (70%) rename core/src/{util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_uebel.snap => stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_uebel.snap} (70%) rename "core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\303\237uper.snap" => "core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\303\237uper.snap" (70%) rename "core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\303\274bel.snap" => "core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\303\274bel.snap" (70%) rename "core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\341\272\236uperduper.snap" => "core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\341\272\236uperduper.snap" (71%) rename "core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\341\272\236\303\237.snap" => "core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\341\272\236\303\237.snap" (69%) rename "core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\360\237\230\200.snap" => "core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\360\237\230\200.snap" (77%) create mode 100644 core/src/stages/tooling.rs delete mode 100644 core/src/util/mod.rs delete mode 100644 core/src/util/strings.rs diff --git a/Cargo.lock b/Cargo.lock index f6aac1c1..4506b491 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -283,8 +283,12 @@ checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" name = "common" version = "0.1.0" dependencies = [ + "insta", + "itertools", "log", + "paste", "rstest", + "serde", ] [[package]] diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..6cf20411 --- /dev/null +++ b/LICENSE @@ -0,0 +1,7 @@ +Copyright 2023 Alex Povel + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 00000000..a27553b2 --- /dev/null +++ b/README.md @@ -0,0 +1,4 @@ +# betterletter + +Substitute alternative, ASCII-only spellings of special characters with their Unicode +equivalents. diff --git a/common/Cargo.toml b/common/Cargo.toml index b55458d1..477a2caf 100644 --- a/common/Cargo.toml +++ b/common/Cargo.toml @@ -2,11 +2,16 @@ name = "common" version = "0.1.0" edition = "2021" +publish = false # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] log = "0.4.17" +itertools = "0.10.5" +paste = "1.0.12" +serde = { version = "1.0.163", features = ["derive"] } [dev-dependencies] rstest = "0.17.0" +insta = { version = "1.29.0", features = ["yaml"] } diff --git a/common/src/itertools.rs b/common/src/itertools.rs new file mode 100644 index 00000000..1f8db0ef --- /dev/null +++ b/common/src/itertools.rs @@ -0,0 +1,65 @@ +use itertools::Itertools; + +pub fn _power_set(collection: C) -> Vec> +where + C: IntoIterator, + T: Clone, +{ + power_set_impl(collection, true) +} + +pub fn power_set_without_empty(collection: C) -> Vec> +where + C: IntoIterator, + T: Clone, +{ + power_set_impl(collection, false) +} + +fn power_set_impl(collection: C, include_empty_set: bool) -> Vec> +where + C: IntoIterator, + T: Clone, +{ + let vec = collection.into_iter().collect_vec(); + + // https://en.wikipedia.org/wiki/Power_set#Properties + let mut result = Vec::with_capacity(2usize.checked_pow(vec.len() as u32).expect("Overflow")); + + let start = if include_empty_set { 0 } else { 1 }; + + for i in start..=vec.len() { + result.extend(vec.iter().cloned().combinations(i)); + } + + result +} + +#[cfg(test)] +mod tests { + use super::{_power_set, power_set_without_empty}; + use crate::instrament; + use rstest::rstest; + + instrament! { + #[rstest] + fn test_power_set( + #[values(vec![], vec![1], vec![1, 2], vec![1, 2, 3])] + collection: Vec + ) (|data: &TestPowerSet| { + let result = _power_set(collection.clone()); + insta::assert_yaml_snapshot!(data.to_string(), result); + }) + } + + instrament! { + #[rstest] + fn test_power_set_without_empty( + #[values(vec![], vec![1], vec![1, 2], vec![1, 2, 3])] + collection: Vec + ) (|data: &TestPowerSetWithoutEmpty| { + let result = power_set_without_empty(collection.clone()); + insta::assert_yaml_snapshot!(data.to_string(), result); + }) + } +} diff --git a/common/src/lib.rs b/common/src/lib.rs index 1580de4d..1a7b6d25 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -1,93 +1,6 @@ -use log::trace; +#![allow(clippy::cargo_common_metadata)] pub mod instrament; - -pub fn titlecase(word: &str) -> String { - let mut chars = word.chars(); - let mut result = String::with_capacity(word.len()); - - if let Some(c) = chars.next() { - for upper in c.to_uppercase() { - result.push(upper); - } - } - - for c in chars { - for lower in c.to_lowercase() { - result.push(lower); - } - } - - result -} - -pub fn is_compound_word(word: &str, predicate: &impl Fn(&str) -> bool) -> bool { - trace!("Checking if word is valid compound word: '{}'", word); - - let indices = word.char_indices().skip(1); - - // Greedily fetch the longest possible prefix. Otherwise, we short-circuit and might - // end up looking for (for example) "He" of "Heizölrechnung" and its suffix - // "izölrechnung" (not a word), whereas we could have found "Heizöl" and "Rechnung" - // instead. - let mut highest_valid_index = None; - for (i, _) in indices { - let prefix = &word[..i]; - - if predicate(prefix) { - highest_valid_index = Some(i); - } - } - - match highest_valid_index { - Some(i) => { - let suffix = &word[i..]; - - trace!( - "Prefix '{}' found in word list, seeing if suffix '{}' is valid.", - &word[..i], - suffix - ); - - predicate(&titlecase(suffix)) - || predicate(suffix) - || is_compound_word(&titlecase(suffix), predicate) - || is_compound_word(suffix, predicate) - } - None => false, - } -} - -#[cfg(test)] -mod tests { - use super::*; - use rstest::rstest; - - #[rstest] - #[case("hello", "Hello")] - #[case("bItTe", "Bitte")] - #[case("dANKE", "Danke")] - #[case("übel", "Übel")] - #[case("uebel", "Uebel")] - #[case("😀", "😀")] - #[case("ßuper", "SSuper")] - #[case("ẞuperduper", "ẞuperduper")] - #[case("WOW!!", "Wow!!")] - #[case("ẞß", "ẞß")] - fn test_titlecase(#[case] word: &str, #[case] expected: &str) { - assert_eq!(titlecase(word), expected); - } - - const WORDS: &[&str] = &["Süßwasser", "schwimm", "Bäder", "Mauer", "Dübel", "Kübel"]; - - #[rstest] - #[case("Süßwasserschwimmbäder", true)] - #[case("Mauerdübel", true)] - #[case("Mauerdübelkübel", true)] - #[case("Not a compound word", false)] - #[case("Mauer好", false)] - #[case("Mauerdjieojoid", false)] - fn test_is_compound_word(#[case] word: &str, #[case] expected: bool) { - assert_eq!(is_compound_word(word, &|w| WORDS.contains(&w)), expected); - } -} +pub mod itertools; +pub mod lookup; +pub mod strings; diff --git a/core/src/util/iteration.rs b/common/src/lookup.rs similarity index 71% rename from core/src/util/iteration.rs rename to common/src/lookup.rs index 59753299..0ccec979 100644 --- a/core/src/util/iteration.rs +++ b/common/src/lookup.rs @@ -2,41 +2,6 @@ use itertools::Itertools; use std::cmp::Ordering; use std::str; -pub fn _power_set(collection: C) -> Vec> -where - C: IntoIterator, - T: Clone, -{ - power_set_impl(collection, true) -} - -pub fn power_set_without_empty(collection: C) -> Vec> -where - C: IntoIterator, - T: Clone, -{ - power_set_impl(collection, false) -} - -fn power_set_impl(collection: C, include_empty_set: bool) -> Vec> -where - C: IntoIterator, - T: Clone, -{ - let vec = collection.into_iter().collect_vec(); - - // https://en.wikipedia.org/wiki/Power_set#Properties - let mut result = Vec::with_capacity(2usize.checked_pow(vec.len() as u32).expect("Overflow")); - - let start = if include_empty_set { 0 } else { 1 }; - - for i in start..=vec.len() { - result.extend(vec.iter().cloned().combinations(i)); - } - - result -} - pub fn binary_search_uneven(needle: &str, haystack: &str, sep: char) -> bool { if needle.is_empty() { return true; @@ -83,32 +48,9 @@ pub fn binary_search_uneven(needle: &str, haystack: &str, sep: char) -> bool { #[cfg(test)] mod tests { - use super::{_power_set, power_set_without_empty}; - use common::instrament; + use super::binary_search_uneven; use rstest::rstest; - instrament! { - #[rstest] - fn test_power_set( - #[values(vec![], vec![1], vec![1, 2], vec![1, 2, 3])] - collection: Vec - ) (|data: &TestPowerSet| { - let result = _power_set(collection.clone()); - insta::assert_yaml_snapshot!(data.to_string(), result); - }) - } - - instrament! { - #[rstest] - fn test_power_set_without_empty( - #[values(vec![], vec![1], vec![1, 2], vec![1, 2, 3])] - collection: Vec - ) (|data: &TestPowerSetWithoutEmpty| { - let result = power_set_without_empty(collection.clone()); - insta::assert_yaml_snapshot!(data.to_string(), result); - }) - } - #[rstest] // Base cases, all elements present in any position. #[case("abc", "abc,def,ghi,jkl,mno,pqr,stu,vwx,yz", ',', true)] @@ -173,6 +115,6 @@ mod tests { #[case] sep: char, #[case] expected: bool, ) { - assert_eq!(super::binary_search_uneven(needle, haystack, sep), expected); + assert_eq!(binary_search_uneven(needle, haystack, sep), expected); } } diff --git a/core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set-[].snap b/common/src/snapshots/common__itertools__tests__test_power_set-[].snap similarity index 61% rename from core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set-[].snap rename to common/src/snapshots/common__itertools__tests__test_power_set-[].snap index 856bdebd..e3509d37 100644 --- a/core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set-[].snap +++ b/common/src/snapshots/common__itertools__tests__test_power_set-[].snap @@ -1,5 +1,5 @@ --- -source: core/src/util/iteration.rs +source: common/src/itertools.rs expression: result info: collection: [] diff --git a/core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set-[_1,_2,_3,_].snap b/common/src/snapshots/common__itertools__tests__test_power_set-[_1,_2,_3,_].snap similarity index 80% rename from core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set-[_1,_2,_3,_].snap rename to common/src/snapshots/common__itertools__tests__test_power_set-[_1,_2,_3,_].snap index 616dc00b..76dfc05e 100644 --- a/core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set-[_1,_2,_3,_].snap +++ b/common/src/snapshots/common__itertools__tests__test_power_set-[_1,_2,_3,_].snap @@ -1,5 +1,5 @@ --- -source: core/src/util/iteration.rs +source: common/src/itertools.rs expression: result info: collection: diff --git a/core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set-[_1,_2,_].snap b/common/src/snapshots/common__itertools__tests__test_power_set-[_1,_2,_].snap similarity index 72% rename from core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set-[_1,_2,_].snap rename to common/src/snapshots/common__itertools__tests__test_power_set-[_1,_2,_].snap index c1885fea..162673a7 100644 --- a/core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set-[_1,_2,_].snap +++ b/common/src/snapshots/common__itertools__tests__test_power_set-[_1,_2,_].snap @@ -1,5 +1,5 @@ --- -source: core/src/util/iteration.rs +source: common/src/itertools.rs expression: result info: collection: diff --git a/core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set-[_1,_].snap b/common/src/snapshots/common__itertools__tests__test_power_set-[_1,_].snap similarity index 65% rename from core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set-[_1,_].snap rename to common/src/snapshots/common__itertools__tests__test_power_set-[_1,_].snap index 1ef4245b..d7475bba 100644 --- a/core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set-[_1,_].snap +++ b/common/src/snapshots/common__itertools__tests__test_power_set-[_1,_].snap @@ -1,5 +1,5 @@ --- -source: core/src/util/iteration.rs +source: common/src/itertools.rs expression: result info: collection: diff --git a/core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set_without_empty-[].snap b/common/src/snapshots/common__itertools__tests__test_power_set_without_empty-[].snap similarity index 60% rename from core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set_without_empty-[].snap rename to common/src/snapshots/common__itertools__tests__test_power_set_without_empty-[].snap index 1f8a7c44..eab078b8 100644 --- a/core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set_without_empty-[].snap +++ b/common/src/snapshots/common__itertools__tests__test_power_set_without_empty-[].snap @@ -1,5 +1,5 @@ --- -source: core/src/util/iteration.rs +source: common/src/itertools.rs expression: result info: collection: [] diff --git a/core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set_without_empty-[_1,_2,_3,_].snap b/common/src/snapshots/common__itertools__tests__test_power_set_without_empty-[_1,_2,_3,_].snap similarity index 80% rename from core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set_without_empty-[_1,_2,_3,_].snap rename to common/src/snapshots/common__itertools__tests__test_power_set_without_empty-[_1,_2,_3,_].snap index 8e4322ff..e3f80f14 100644 --- a/core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set_without_empty-[_1,_2,_3,_].snap +++ b/common/src/snapshots/common__itertools__tests__test_power_set_without_empty-[_1,_2,_3,_].snap @@ -1,5 +1,5 @@ --- -source: core/src/util/iteration.rs +source: common/src/itertools.rs expression: result info: collection: diff --git a/core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set_without_empty-[_1,_2,_].snap b/common/src/snapshots/common__itertools__tests__test_power_set_without_empty-[_1,_2,_].snap similarity index 71% rename from core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set_without_empty-[_1,_2,_].snap rename to common/src/snapshots/common__itertools__tests__test_power_set_without_empty-[_1,_2,_].snap index 0aeb3ec9..b1182346 100644 --- a/core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set_without_empty-[_1,_2,_].snap +++ b/common/src/snapshots/common__itertools__tests__test_power_set_without_empty-[_1,_2,_].snap @@ -1,5 +1,5 @@ --- -source: core/src/util/iteration.rs +source: common/src/itertools.rs expression: result info: collection: diff --git a/core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set_without_empty-[_1,_].snap b/common/src/snapshots/common__itertools__tests__test_power_set_without_empty-[_1,_].snap similarity index 63% rename from core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set_without_empty-[_1,_].snap rename to common/src/snapshots/common__itertools__tests__test_power_set_without_empty-[_1,_].snap index 777a3992..68971d6e 100644 --- a/core/src/util/snapshots/betterletter__util__iteration__tests__test_power_set_without_empty-[_1,_].snap +++ b/common/src/snapshots/common__itertools__tests__test_power_set_without_empty-[_1,_].snap @@ -1,5 +1,5 @@ --- -source: core/src/util/iteration.rs +source: common/src/itertools.rs expression: result info: collection: diff --git a/common/src/strings.rs b/common/src/strings.rs new file mode 100644 index 00000000..db8731cc --- /dev/null +++ b/common/src/strings.rs @@ -0,0 +1,91 @@ +use log::trace; + +pub fn titlecase(word: &str) -> String { + let mut chars = word.chars(); + let mut result = String::with_capacity(word.len()); + + if let Some(c) = chars.next() { + for upper in c.to_uppercase() { + result.push(upper); + } + } + + for c in chars { + for lower in c.to_lowercase() { + result.push(lower); + } + } + + result +} + +pub fn is_compound_word(word: &str, predicate: &impl Fn(&str) -> bool) -> bool { + trace!("Checking if word is valid compound word: '{}'", word); + + let indices = word.char_indices().skip(1); + + // Greedily fetch the longest possible prefix. Otherwise, we short-circuit and might + // end up looking for (for example) "He" of "Heizölrechnung" and its suffix + // "izölrechnung" (not a word), whereas we could have found "Heizöl" and "Rechnung" + // instead. + let mut highest_valid_index = None; + for (i, _) in indices { + let prefix = &word[..i]; + + if predicate(prefix) { + highest_valid_index = Some(i); + } + } + + match highest_valid_index { + Some(i) => { + let suffix = &word[i..]; + + trace!( + "Prefix '{}' found in word list, seeing if suffix '{}' is valid.", + &word[..i], + suffix + ); + + predicate(&titlecase(suffix)) + || predicate(suffix) + || is_compound_word(&titlecase(suffix), predicate) + || is_compound_word(suffix, predicate) + } + None => false, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use rstest::rstest; + + #[rstest] + #[case("hello", "Hello")] + #[case("bItTe", "Bitte")] + #[case("dANKE", "Danke")] + #[case("übel", "Übel")] + #[case("uebel", "Uebel")] + #[case("😀", "😀")] + #[case("ßuper", "SSuper")] + #[case("ẞuperduper", "ẞuperduper")] + #[case("WOW!!", "Wow!!")] + #[case("ẞß", "ẞß")] + fn test_titlecase(#[case] word: &str, #[case] expected: &str) { + assert_eq!(titlecase(word), expected); + } + + const WORDS: &[&str] = &["Süßwasser", "schwimm", "Bäder", "Mauer", "Dübel", "Kübel"]; + + #[rstest] + #[case("Süßwasserschwimmbäder", true)] + #[case("Mauerdübel", true)] + #[case("Mauerdübelkübel", true)] + #[case("Not a compound word", false)] + #[case("Mauer好", false)] + #[case("Mauerdjieojoid", false)] + fn test_is_compound_word(#[case] word: &str, #[case] expected: bool) { + assert_eq!(is_compound_word(word, &|w| WORDS.contains(&w)), expected); + } +} diff --git a/core/Cargo.toml b/core/Cargo.toml index b8b29143..ad688645 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -2,6 +2,20 @@ name = "betterletter" version = "0.1.0" edition = "2021" +authors = ["Alex Povel "] +description = "Substitute alternative, ASCII-only spellings of special characters with their Unicode equivalents." +license-file = "LICENSE" +repository = "https://github.com/alexpovel/betterletter-rs" +readme = "README.md" +documentation = "https://docs.rs/betterletter" +keywords = ["spelling", "umlaut", "substitute", "unicode", "alternative"] +categories = [ + "accessibility", + "internationalization", + "localization", + "text-processing", + "value-formatting", +] [dependencies] cached = "0.43.0" diff --git a/core/benches/lookup.rs b/core/benches/lookup.rs index b57a9d4c..f7e7dfcb 100644 --- a/core/benches/lookup.rs +++ b/core/benches/lookup.rs @@ -1,4 +1,4 @@ -use betterletter::util::iteration::binary_search_uneven; +use common::lookup::binary_search_uneven; use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; use std::collections::HashSet; diff --git a/core/benches/threading.rs b/core/benches/threading.rs index a1a03968..2ff1a06a 100644 --- a/core/benches/threading.rs +++ b/core/benches/threading.rs @@ -1,7 +1,6 @@ -use betterletter::apply; #[cfg(feature = "de")] -use betterletter::stages::german::German; -use betterletter::stages::Stage; +use betterletter::stages::GermanStage; +use betterletter::{apply, Stage}; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; use log::info; use std::io::{BufRead, Write}; @@ -16,7 +15,7 @@ fn process_single_threaded_german( mut source: &mut impl BufRead, mut destination: &mut impl Write, ) -> Result<(), std::io::Error> { - let stages: Vec> = vec![Box::new(German)]; + let stages: Vec> = vec![Box::new(GermanStage)]; apply(&stages, &mut source, &mut destination) } @@ -56,7 +55,7 @@ pub fn process_multi_threaded_german( // stage.process(&mut item).unwrap(); // } - let stage = German; + let stage = GermanStage; let result = stage.substitute(&item).unwrap(); let mut results = results_clone.lock().unwrap(); diff --git a/core/build.rs b/core/build.rs index 9e6afeaa..1cd795a9 100644 --- a/core/build.rs +++ b/core/build.rs @@ -1,4 +1,4 @@ -use common::is_compound_word; +use common::strings::is_compound_word; use std::collections::HashSet; use std::io::{BufReader, BufWriter, Read, Write}; use std::{ diff --git a/core/src/lib.rs b/core/src/lib.rs index aa991c55..613c74ca 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -1,3 +1,14 @@ +#![warn(clippy::all)] +#![warn(clippy::pedantic)] +#![warn(clippy::cargo)] +#![warn(missing_copy_implementations)] +#![warn(missing_debug_implementations)] +#![warn(trivial_casts, trivial_numeric_casts)] +#![warn(unused_qualifications)] +#![warn(variant_size_differences)] +#![forbid(unsafe_code)] +#![warn(missing_docs)] +#![allow(clippy::multiple_crate_versions)] //! Substitute alternative, ASCII-only spellings of special characters with their //! Unicode equivalents. //! @@ -9,21 +20,55 @@ pub use crate::stages::Stage; use log::{debug, info}; use std::io::{BufRead, Error, Write}; +/// Main components around [`Stage`]s and their [processing][Stage::substitute]. pub mod stages; -pub mod util; const EXPECTABLE_MAXIMUM_WORD_LENGTH_BYTES: u8 = 64; const EXPECTABLE_MAXIMUM_MATCHES_PER_WORD: u8 = 8; +/// Apply the list of [stages][Stage] to a source, writing results to the given +/// destination. +/// +/// The stages will be applied in the order given. The source is expected to be +/// UTF-8-encoded text, and will be read [line-by-line][BufRead::read_line]. Each +/// processed line will be written to the destination immediately. +/// +/// # Example: Using a single stage (German) +/// +/// See also [`crate::stages::GermanStage`]. +/// +/// +/// ``` +/// use betterletter::{apply, stages::GermanStage, Stage}; +/// use std::io::Cursor; +/// +/// let stages = vec![Box::new(GermanStage)].into_iter().map(|g| g as Box).collect(); +/// +/// let mut input = Cursor::new("Gruess Gott!\n"); +/// let mut output: Vec = Vec::new(); +/// +/// apply(&stages, &mut input, &mut output); +/// +/// assert_eq!(output, "Grüß Gott!\n".as_bytes()); +/// ``` +/// +/// # Errors +/// +/// An error will be returned in the following cases: +/// +/// - when a [`Stage`] fails its substitution +/// - when the source cannot be read +/// - when the destination cannot be written to +/// - when the destination cannot be flushed before exiting pub fn apply( stages: &Vec>, source: &mut impl BufRead, destination: &mut impl Write, ) -> Result<(), Error> { - let mut buf = String::new(); - const EOF_INDICATOR: usize = 0; + let mut buf = String::new(); + while source.read_line(&mut buf)? > EOF_INDICATOR { debug!("Starting processing line: '{}'", buf.escape_debug()); diff --git a/core/src/main.rs b/core/src/main.rs index 4616f214..b4a4a18d 100644 --- a/core/src/main.rs +++ b/core/src/main.rs @@ -1,8 +1,8 @@ use betterletter::apply; #[cfg(feature = "de")] -use betterletter::stages::german::German; +use betterletter::stages::GermanStage; #[cfg(feature = "symbols")] -use betterletter::stages::symbols::Symbols; +use betterletter::stages::SymbolsStage; use log::{debug, info}; use std::io::{self, BufReader, Error}; @@ -18,9 +18,9 @@ fn main() -> Result<(), Error> { .map(|stage| { let tp: Box = match stage { #[cfg(feature = "de")] - cli::Stage::German => Box::new(German), + cli::Stage::German => Box::new(GermanStage), #[cfg(feature = "symbols")] - cli::Stage::Symbols => Box::new(Symbols), + cli::Stage::Symbols => Box::new(SymbolsStage), }; debug!("Loaded stage: {:?}", stage); diff --git a/core/src/stages/german/driver.rs b/core/src/stages/german/driver.rs index 6e107d21..ea5fed66 100644 --- a/core/src/stages/german/driver.rs +++ b/core/src/stages/german/driver.rs @@ -1,28 +1,287 @@ -use crate::{ - stages::{ - german::{ - machine::{StateMachine, Transition}, - words::{Replace, Replacement}, - }, - Stage, StageResult, - }, - util::{ - iteration::{binary_search_uneven, power_set_without_empty}, - strings::WordCasing, +use crate::stages::{ + german::{ + machine::{StateMachine, Transition}, + words::{Replace, Replacement, WordCasing}, }, + tooling::StageResult, + Stage, }; use cached::proc_macro::cached; use cached::SizedCache; -use common::{is_compound_word, titlecase}; +use common::itertools::power_set_without_empty; +use common::lookup::binary_search_uneven; +use common::strings::{is_compound_word, titlecase}; use log::{debug, trace}; static VALID_GERMAN_WORDS: &str = include_str!(concat!(env!("OUT_DIR"), "/de.txt")); // Generated in `build.rs`. -#[derive(Clone, Copy)] -pub struct German; - -impl Stage for German { +/// German language stage, responsible for Umlauts and Eszett. +/// +/// This stage is responsible for applying the following rules, **where applicable**: +/// - [*Umlauts*](https://en.wikipedia.org/wiki/Umlaut_(diacritic)): replace `ue`, `oe`, +/// `ae` with `ü`, `ö`, `ä`, respectively, +/// - [*Eszett*](https://en.wikipedia.org/wiki/%C3%9F): replace `ss` with `ß`. +/// +/// Mechanisms are in place to uphold the following properties: +/// - both lower- and uppercase variants are handled correctly, +/// - compound words are handled correctly. +/// +/// Towards this, this stage does *not* simply replace all occurrences, but performs +/// checks to ensure only valid replacements are made. The core of these checks is an +/// exhaustive word list. The better the word list, the better the results. As such, any +/// errors in processing could be the result of a faulty word list *or* faulty +/// algorithms. +/// +/// # Example: A simple greeting, with Umlaut and Eszett +/// +/// ``` +/// use betterletter::{Stage, stages::GermanStage}; +/// +/// let result: String = GermanStage.substitute("Gruess Gott!").unwrap().into(); +/// assert_eq!(result, "Grüß Gott!"); +/// ``` +/// +/// # Example: A compound word +/// +/// Note that this compound word is *not* part of the word list (that would be an +/// *elaborate* word list!), but is still handled, as its constituents are. +/// +/// ``` +/// use betterletter::{Stage, stages::GermanStage}; +/// +/// let result: String = GermanStage.substitute("Du Suesswassertagtraeumer!").unwrap().into(); +/// assert_eq!(result, "Du Süßwassertagträumer!"); +/// ``` +/// +/// # Example: Words *validly* containing alternative Umlaut spelling +/// +/// These spellings are *not* replaced, as they are valid words in their own right. +/// Naive implementations/translations (e.g. +/// [`tr`](https://en.wikipedia.org/wiki/Tr_(Unix))) would not handle this correctly. +/// +/// ``` +/// use betterletter::{Stage, stages::GermanStage}; +/// +/// for word in &[ +/// // "ae" +/// "Aerodynamik", // should not be "Ärodynamik" +/// "Israel", // should not be "Isräl" +/// "Schufaeintrag", // should not be "Schufäintrag" +/// // "oe" +/// "Koeffizient", // should not be "Köffizient" +/// "Dominoeffekt", // should not be "Dominöffekt" +/// "Poet", // should not be "Pöt" +/// // "ue" +/// "Abenteuer", // should not be "Abenteür" +/// "Mauer", // should not be "Maür" +/// "Steuerung", // should not be "Steürung" +/// ] { +/// let result: String = GermanStage.substitute(word).unwrap().into(); +/// assert_eq!(result, word.to_string()); +/// } +/// ``` +/// +/// Note that `ss`/`ß` is not mentioned, as it is handled +/// [elsewhere](#example-words-with-valid-alternative-and-special-character-spellings). +/// +/// # Example: Words with valid alternative *and* special character spellings +/// +/// Some words are validly spelled with alternative Umlauts *and* special characters *in +/// the same position*, such as: +/// - [Ma**ß**e](https://de.wiktionary.org/wiki/Ma%C3%9Fe): pertaining to measurements +/// - [Ma**ss**e](https://de.wiktionary.org/wiki/Masse): pertaining to mass/weight +/// +/// So if a user inputs `Masse` (they can't spell `Maße`, else they wouldn't have +/// reached for this crate in the first place), what do they mean? Such cases are +/// tricky, as there isn't an easy solution without reaching for full-blown +/// [NLP](https://en.wikipedia.org/wiki/Natural_language_processing) or ML, as the +/// word's context would be required. This stage is much too limited for that. A choice +/// has to be made: +/// +/// - do not replace: keep alternative spelling, or +/// - replace: keep special character spelling. +/// +/// This tool chooses the latter, as it seems [the least +/// astonishing](https://en.wikipedia.org/wiki/Principle_of_least_astonishment) in the +/// context of this tool, whose entire point is to **make replacements if they're +/// valid**. +/// +/// This is an issue mainly for Eszett (`ß`), as for it, two valid spellings are much +/// more likely than for Umlauts. +/// +/// ``` +/// use betterletter::{Stage, stages::GermanStage}; +/// +/// for (input, output) in &[ +/// ("Busse", "Buße"), // busses / penance +/// ("Masse", "Maße"), // mass / measurements +/// ] { +/// let result: String = GermanStage.substitute(input).unwrap().into(); +/// assert_eq!(result, output.to_string()); +/// } +/// ``` +/// +/// # Example: Upper- and mixed case +/// +/// This stage can handle any case, but assumes **nouns are never lower case** (a pretty +/// mild assumption). The **first letter governs the case** of the entity (Umlaut, +/// Eszett or entire word) in question: +/// +/// | Input | Example Umlaut/Eszett | Example word | Detected case | +/// | ----- | --------------------- | ------------ | ------------- | +/// | `xx` | `ue` | `hello` | lowercase | +/// | `xX` | `sS` | `hElLo` | lowercase | +/// | `Xx` | `Ue` | `Hello` | uppercase | +/// | `XX` | `SS` | `HELLooo` | uppercase | +/// +/// The same principle then further applies to entire words, which is especially +/// noticeable for mixed-case ones. The word list is not going to contain mixed-case +/// words, so a decision has to be made: what case will candidates be checked against? +/// If whatever case was detected is not considered a valid word, the replacement is not +/// made. Example flows follow. +/// +/// ## Subexample: mixed case, invalid word +/// +/// The flow looks like: +/// +/// `aEpFeL` → lowercase Umlaut → `äpFeL` → lowercase word → squash → `äpfel` → ❌ → +/// output is `aEpFeL` +/// +/// +/// ``` +/// use betterletter::{Stage, stages::GermanStage}; +/// +/// let result: String = GermanStage.substitute("aEpFeL").unwrap().into(); +/// +/// // Error: MiXeD CaSe noun without leading capital letter +/// assert_eq!(result, "aEpFeL"); +/// ``` +/// +/// ## Subexample: mixed case, valid word +/// +/// The flow looks like: +/// +/// `AePfEl` → uppercase Umlaut → `ÄPfEl` → uppercase word → squash → `Äpfel` → ✅ → +/// output is `Äpfel` +/// +/// ``` +/// use betterletter::{Stage, stages::GermanStage}; +/// +/// let result: String = GermanStage.substitute("AePfEl").unwrap().into(); +/// +/// // OK: MiXeD CaSe words nouns are okay, *if* starting with a capital letter +/// assert_eq!(result, "ÄPfEl"); +/// ``` +/// +/// ## Subexample: other cases +/// +/// ``` +/// use betterletter::{Stage, stages::GermanStage}; +/// +/// let f = |word: &str| -> String {GermanStage.substitute(word).unwrap().into()}; +/// +/// // OK: The normal case, adjective lowercase +/// assert_eq!(f("Voll suess!"), "Voll süß!"); +/// +/// // OK: Adjective uppercase (start of sentence) +/// assert_eq!(f("Suesses Eis!"), "Süßes Eis!"); +/// +/// // OK: Uppercased noun +/// assert_eq!(f("Aepfel"), "Äpfel"); +/// +/// // Error: Lowercased noun is *not* replaced, we are not a spell checker +/// assert_eq!(f("aepfel"), "aepfel"); +/// +/// // OK: SCREAMING CASE noun is okay though +/// assert_eq!(f("AEPFEL"), "ÄPFEL"); +/// +/// // OK: SCREAMING CASE verb is okay as well +/// assert_eq!(f("SCHLIESSEN"), "SCHLIEẞEN"); +/// +/// // OK: MiXeD CaSe verb: inserted special character is uppercase +/// assert_eq!(f("fUeLleN"), "fÜLleN"); +/// +/// // OK: MiXeD CaSe verb: inserted special character is lowercase +/// assert_eq!(f("FuElLEn"), "FülLEn"); +/// ``` +/// +/// ### Capital Eszett (ẞ) +/// +/// Note the spelling of `SCHLIEẞEN` containing `ẞ`, the [uppercase version of +/// `ß`](https://www.wikidata.org/wiki/Q9693), part of [official spelling since +/// 2017](https://web.archive.org/web/20230206102049/https://www.rechtschreibrat.com/DOX/rfdr_PM_2017-06-29_Aktualisierung_Regelwerk.pdf). +/// It's the result of uppercasing `ß` of `schließen`. This does **not** follow Rust's +/// usual behavior, which is why it is specially mentioned here: +/// +/// ``` +/// let lc = "ß"; +/// let uc = "ẞ"; +/// +/// assert_eq!(lc.to_uppercase().to_string(), "SS"); +/// +/// // The other way around works though: +/// assert_eq!(uc.to_lowercase().to_string(), lc); +/// +/// // Uppercase stays uppercase: +/// assert_eq!(uc.to_uppercase().to_string(), uc); +/// +/// // Lowercase stays lowercase (as opposed to `ss`): +/// assert_eq!(lc.to_lowercase().to_string(), lc); +/// ``` +/// +/// The `SS` of `SCHLIESSEN` is detected as an uppercase Eszett, which is specifically +/// inserted. You might want to run additional processing if this is undesired. +/// +/// # Example: Other bytes +/// +/// This stage handles the German alphabet *only*, and will leave other input bytes +/// untouched. You get to keep your trailing newlines, emojis (also multi-[`char`] ones), +/// and everything else. +/// +/// Of course, the input has to be valid UTF-8, as is ensured by its signature ([`str`]). +/// +/// ``` +/// use betterletter::{Stage, stages::GermanStage}; +/// +/// let result: String = GermanStage.substitute("\0Schoener 你好 Satz... 👋🏻\r\n\n").unwrap().into(); +/// assert_eq!(result, "\0Schöner 你好 Satz... 👋🏻\r\n\n"); +/// ``` +/// +/// # Performance +/// +/// This stage is implemented as a [finite state +/// machine](https://en.wikipedia.org/wiki/Finite-state_machine), which means it runs in +/// linear time as well as constant space. It is therefore very fast and memory +/// efficient, requiring only a single pass over the input [`str`]. +/// +/// The underlying checks for valid words are implemented as a +/// [memoized](https://en.wikipedia.org/wiki/Memoization), recursive binary search. +/// While they're fast, other methods could be faster but weren't chosen for one or more +/// of these reasons: +/// +/// - poor developer experience: +/// - [`clippy`](https://github.com/rust-lang/rust-clippy) would choke on them +/// - compilation times of 5 minutes and more (on fast hardware) +/// - large binary size: +/// +/// A simple array of strings, `&[&str]`, adds two [`usize`] in terms of overhead **per +/// [`str`]** (tuple of `(pointer, length)`), which is 16 bytes on 64-bit systems and +/// therefore **longer than the average word** (which sits at around 15 bytes, give or +/// take). Seeing as there can be hundreds of thousands, if not millions of entries, +/// this quickly *doubles* the binary size for no good reason. +/// - not available statically, aka at compile time, aka incurring a runtime cost. This +/// crate's binary is optimized for start-up speed. +/// +/// For more info, an overview of the methods tried +/// ([`phf`](https://crates.io/crates/phf) and more), and benchmarks, see [this +/// issue](https://github.com/alexpovel/betterletter-rs/issues/9). +#[derive(Debug, Clone, Copy)] +pub struct GermanStage; + +impl Stage for GermanStage { fn substitute(&self, input: &str) -> StageResult { + const INDICATOR: char = '\0'; + debug!("Working on input '{}'", input.escape_debug()); let mut output = String::with_capacity(input.len()); @@ -31,14 +290,13 @@ impl Stage for German { // The state machine, much like a missing trailing newline in a file, will // misbehave if the very last transition is not an 'external' one (the last word // won't be detected properly). - const INDICATOR: char = '\0'; for char in input.chars().chain(std::iter::once(INDICATOR)) { trace!( "Beginning processing of character '{}'", char.escape_debug() ); - let transition = machine.transition(&char); + let transition = machine.transition(char); trace!("Transition is '{:?}'", transition); @@ -71,8 +329,7 @@ impl Stage for German { let c = output.pop(); debug_assert!( c == Some(INDICATOR), - "Trailing indicator byte expected, but found '{:?}'.", - c + "Trailing indicator byte expected, but found '{c:?}'." ); debug!("Final output string is '{}'", output.escape_debug()); @@ -82,7 +339,7 @@ impl Stage for German { } fn find_valid_replacement(word: &str, replacements: &[Replacement]) -> Option { - let replacement_combinations = power_set_without_empty(replacements.iter().cloned()); + let replacement_combinations = power_set_without_empty(replacements.iter().copied()); debug!("Starting search for valid replacement for word '{}'", word); trace!( "All replacement combinations to try: {:?}", @@ -100,9 +357,9 @@ fn find_valid_replacement(word: &str, replacements: &[Replacement]) -> Option bool) -> bool { match casing { Ok(WordCasing::AllLowercase) => { - // Adjectives, verbs, etc.: always lowercase. Nouns are *never* assumed to - // occur all lowercase (e.g. "laufen"). In any case, there is no further - // processing we can/want to do (or is there... + // There is no further processing we can/want to do (or is there... // https://www.youtube.com/watch?v=HLRdruqQfRk). predicate(word) } - Ok(WordCasing::AllUppercase | WordCasing::Mixed) => { - // Before proceeding, convert `SCREAMING` or `MiXeD` words to something - // sensible, then see from there (e.g. "ABENTEUER" -> "Abenteuer", - // "üBeRTrIeBeN" -> "Übertrieben"). See `Titlecase` for what happens next. - + Ok(WordCasing::AllUppercase) => { + // Convert to something sensible before proceeding. let tc = titlecase(word); debug_assert!( WordCasing::try_from(tc.as_str()) == Ok(WordCasing::Titlecase), @@ -146,6 +398,22 @@ fn is_valid(word: &str, predicate: &impl Fn(&str) -> bool) -> bool { is_valid(&tc, predicate) } + Ok(WordCasing::Mixed) => { + // For MiXeD casing, the word's first character governs its further + // treatment. + match word.chars().next() { + Some(c) if c.is_uppercase() => { + let tc = titlecase(word); + debug_assert!( + WordCasing::try_from(tc.as_str()) == Ok(WordCasing::Titlecase), + "Titlecased word, but isn't categorized correctly." + ); + + is_valid(&tc, predicate) + } + _ => is_valid(&word.to_lowercase(), predicate), + } + } Ok(WordCasing::Titlecase) => { // Regular nouns are normally titlecase, so see if they're found // immediately (e.g. "Haus"). @@ -174,7 +442,7 @@ mod tests { let original = VALID_GERMAN_WORDS.lines().collect_vec(); let mut sorted = VALID_GERMAN_WORDS.lines().collect_vec(); - sorted.sort(); + sorted.sort_unstable(); // see also: clippy::stable_sort_primitive assert_eq!(original, sorted.as_slice()); } @@ -184,7 +452,7 @@ mod tests { let original = VALID_GERMAN_WORDS.lines().collect_vec(); let mut unique = VALID_GERMAN_WORDS.lines().collect_vec(); - unique.sort(); + unique.sort_unstable(); // see also: clippy::stable_sort_primitive unique.dedup(); assert_eq!(original, unique.as_slice()); @@ -193,7 +461,7 @@ mod tests { #[test] fn test_word_list_is_not_filtered() { assert!( - VALID_GERMAN_WORDS.lines().any(|word| word.is_ascii()), + VALID_GERMAN_WORDS.lines().any(str::is_ascii), concat!( "Looks like you're using a filtered word list containing only special characters.", " The current implementation relies on the full word list (also containing all non-Umlaut words)" @@ -270,7 +538,7 @@ mod tests { word: String ) (|data: &TestProcess| { let input = word.clone(); - let result = German{}.substitute(&input).unwrap(); + let result = GermanStage{}.substitute(&input).unwrap(); insta::assert_yaml_snapshot!(data.to_string(), result.0); } ) diff --git a/core/src/stages/german/machine.rs b/core/src/stages/german/machine.rs index 60dab943..3469f16d 100644 --- a/core/src/stages/german/machine.rs +++ b/core/src/stages/german/machine.rs @@ -68,14 +68,17 @@ impl StateMachine { }; } - pub fn transition(&mut self, input: &MachineInput) -> Transition { + pub fn transition(&mut self, input: MachineInput) -> Transition { self.pre_transition(); let next = match (&self.state, input) { - (State::Word(Some(Potential(Umlaut(umlaut)))), c @ 'e' | c @ 'E') => { + (State::Word(Some(Potential(Umlaut(umlaut)))), c @ ('e' | 'E')) => { + const LENGTH_OF_PREVIOUS_CHARACTER: usize = 1; + let pos = self.word.len(); - const LENGTH_OF_PREVIOUS_CHARACTER: usize = 1; + // We're in a state machine, so we cannot know the length of the + // previous character, as have to assume its length here. debug_assert!( 'o'.len_utf8() == LENGTH_OF_PREVIOUS_CHARACTER && 'u'.len_utf8() == LENGTH_OF_PREVIOUS_CHARACTER @@ -93,7 +96,7 @@ impl StateMachine { State::Word(None) } - (State::Word(Some(Potential(Eszett(casing)))), c @ 's' | c @ 'S') => { + (State::Word(Some(Potential(Eszett(casing)))), c @ ('s' | 'S')) => { let pos = self.word.len(); let start = pos - c.len_utf8(); // Previous char same as current `c` @@ -129,9 +132,9 @@ impl StateMachine { transition } - fn post_transition(&mut self, input: &MachineInput) { + fn post_transition(&mut self, input: MachineInput) { if let Some(Transition::Entered | Transition::Internal) = self.transition { - self.word.push(*input); + self.word.push(input); trace!( "Appending {:?} to current word due to transition {:?}.", input, diff --git a/core/src/stages/german/mod.rs b/core/src/stages/german/mod.rs index 116c5e84..f72a0351 100644 --- a/core/src/stages/german/mod.rs +++ b/core/src/stages/german/mod.rs @@ -3,5 +3,6 @@ mod machine; mod words; // Re-export symbols. -pub use driver::German; +#[allow(clippy::module_name_repetitions)] +pub use driver::GermanStage; pub(self) use words::{LetterCasing, SpecialCharacter, Umlaut, Word}; diff --git a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-.snap b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-.snap similarity index 70% rename from core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-.snap rename to core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-.snap index 73f6dcb3..47d548b1 100644 --- a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-.snap +++ b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-.snap @@ -1,5 +1,5 @@ --- -source: core/src/util/strings.rs +source: core/src/stages/german/words.rs expression: "WordCasing::try_from(word.as_str())" info: word: "" diff --git a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_SCREAMING.snap b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_SCREAMING.snap similarity index 71% rename from core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_SCREAMING.snap rename to core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_SCREAMING.snap index 78e580ee..4bee9a59 100644 --- a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_SCREAMING.snap +++ b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_SCREAMING.snap @@ -1,5 +1,5 @@ --- -source: core/src/util/strings.rs +source: core/src/stages/german/words.rs expression: "WordCasing::try_from(word.as_str())" info: word: SCREAMING diff --git a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_WOW!!.snap b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_WOW!!.snap similarity index 77% rename from core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_WOW!!.snap rename to core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_WOW!!.snap index 2bac8af3..dbcf5246 100644 --- a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_WOW!!.snap +++ b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_WOW!!.snap @@ -1,5 +1,5 @@ --- -source: core/src/util/strings.rs +source: core/src/stages/german/words.rs expression: "WordCasing::try_from(word.as_str())" info: word: WOW!! diff --git a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_bItTe.snap b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_bItTe.snap similarity index 68% rename from core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_bItTe.snap rename to core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_bItTe.snap index c64518b6..f72a9559 100644 --- a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_bItTe.snap +++ b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_bItTe.snap @@ -1,5 +1,5 @@ --- -source: core/src/util/strings.rs +source: core/src/stages/german/words.rs expression: "WordCasing::try_from(word.as_str())" info: word: bItTe diff --git a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_dANKE.snap b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_dANKE.snap similarity index 68% rename from core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_dANKE.snap rename to core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_dANKE.snap index eb6b0bf6..3d4cd661 100644 --- a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_dANKE.snap +++ b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_dANKE.snap @@ -1,5 +1,5 @@ --- -source: core/src/util/strings.rs +source: core/src/stages/german/words.rs expression: "WordCasing::try_from(word.as_str())" info: word: dANKE diff --git a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_hello.snap b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_hello.snap similarity index 70% rename from core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_hello.snap rename to core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_hello.snap index 7be3cfe6..835242c3 100644 --- a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_hello.snap +++ b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_hello.snap @@ -1,5 +1,5 @@ --- -source: core/src/util/strings.rs +source: core/src/stages/german/words.rs expression: "WordCasing::try_from(word.as_str())" info: word: hello diff --git a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_uebel.snap b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_uebel.snap similarity index 70% rename from core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_uebel.snap rename to core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_uebel.snap index 0e5110dc..d4016903 100644 --- a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_uebel.snap +++ b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_uebel.snap @@ -1,5 +1,5 @@ --- -source: core/src/util/strings.rs +source: core/src/stages/german/words.rs expression: "WordCasing::try_from(word.as_str())" info: word: uebel diff --git "a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\303\237uper.snap" "b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\303\237uper.snap" similarity index 70% rename from "core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\303\237uper.snap" rename to "core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\303\237uper.snap" index b081d177..a77c03aa 100644 --- "a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\303\237uper.snap" +++ "b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\303\237uper.snap" @@ -1,5 +1,5 @@ --- -source: core/src/util/strings.rs +source: core/src/stages/german/words.rs expression: "WordCasing::try_from(word.as_str())" info: word: ßuper diff --git "a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\303\274bel.snap" "b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\303\274bel.snap" similarity index 70% rename from "core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\303\274bel.snap" rename to "core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\303\274bel.snap" index 2e3661ac..6c9467eb 100644 --- "a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\303\274bel.snap" +++ "b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\303\274bel.snap" @@ -1,5 +1,5 @@ --- -source: core/src/util/strings.rs +source: core/src/stages/german/words.rs expression: "WordCasing::try_from(word.as_str())" info: word: übel diff --git "a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\341\272\236uperduper.snap" "b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\341\272\236uperduper.snap" similarity index 71% rename from "core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\341\272\236uperduper.snap" rename to "core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\341\272\236uperduper.snap" index 5933d908..df9cd24e 100644 --- "a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\341\272\236uperduper.snap" +++ "b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\341\272\236uperduper.snap" @@ -1,5 +1,5 @@ --- -source: core/src/util/strings.rs +source: core/src/stages/german/words.rs expression: "WordCasing::try_from(word.as_str())" info: word: ẞuperduper diff --git "a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\341\272\236\303\237.snap" "b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\341\272\236\303\237.snap" similarity index 69% rename from "core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\341\272\236\303\237.snap" rename to "core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\341\272\236\303\237.snap" index bf6441f5..3bbf0cef 100644 --- "a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\341\272\236\303\237.snap" +++ "b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\341\272\236\303\237.snap" @@ -1,5 +1,5 @@ --- -source: core/src/util/strings.rs +source: core/src/stages/german/words.rs expression: "WordCasing::try_from(word.as_str())" info: word: ẞß diff --git "a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\360\237\230\200.snap" "b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\360\237\230\200.snap" similarity index 77% rename from "core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\360\237\230\200.snap" rename to "core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\360\237\230\200.snap" index b3b2a3ca..78bf9db0 100644 --- "a/core/src/util/snapshots/betterletter__util__strings__tests__test_word_casing_from_string-_\360\237\230\200.snap" +++ "b/core/src/stages/german/snapshots/betterletter__stages__german__words__tests__test_word_casing_from_string-_\360\237\230\200.snap" @@ -1,5 +1,5 @@ --- -source: core/src/util/strings.rs +source: core/src/stages/german/words.rs expression: "WordCasing::try_from(word.as_str())" info: word: 😀 diff --git a/core/src/stages/german/words.rs b/core/src/stages/german/words.rs index b4a0e620..9a5bbb67 100644 --- a/core/src/stages/german/words.rs +++ b/core/src/stages/german/words.rs @@ -1,6 +1,54 @@ use itertools::Itertools; use std::fmt::Display; +#[derive(Debug, PartialEq, Eq)] +pub(crate) enum WordCasing { + AllLowercase, + AllUppercase, + Titlecase, + Mixed, +} + +impl TryFrom<&str> for WordCasing { + type Error = &'static str; + + fn try_from(value: &str) -> Result { + if value.is_empty() { + return Err("String is empty"); + } + + let mut has_lowercase = false; + let mut has_uppercase = false; + let mut is_titlecase = true; + + for (i, c) in value.chars().enumerate() { + if c.is_lowercase() { + has_lowercase = true; + + if i == 0 { + is_titlecase = false; + } + } else if c.is_uppercase() { + has_uppercase = true; + + if i != 0 { + is_titlecase = false; + } + } else { + return Err("String contains characters with undecidable casing"); + } + } + + match (is_titlecase, has_lowercase, has_uppercase) { + (true, _, _) => Ok(Self::Titlecase), + (_, true, false) => Ok(Self::AllLowercase), + (_, false, true) => Ok(Self::AllUppercase), + (_, true, true) => Ok(Self::Mixed), + (_, false, false) => unreachable!("Impossible case: any non-empty string has either lower- or uppercase or returned an `Err` early."), + } + } +} + #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub(super) enum LetterCasing { Lower, @@ -148,8 +196,8 @@ impl Replace for String { // Assert sorting, such that reversing actually does the right thing. if cfg!(debug_assertions) { - let mut cloned = replacements.iter().cloned().collect_vec(); - cloned.sort_by_key(|replacement| replacement.start()); + let mut cloned = replacements.iter().copied().collect_vec(); + cloned.sort_by_key(crate::stages::german::words::Replacement::start); assert_eq!(cloned, replacements); } @@ -160,3 +208,46 @@ impl Replace for String { } } } + +#[cfg(test)] +mod tests { + use super::*; + use common::instrament; + use rstest::rstest; + use serde::Serialize; + + impl Serialize for WordCasing { + fn serialize(&self, serializer: S) -> Result { + match self { + Self::AllLowercase => serializer.serialize_str("AllLowercase"), + Self::AllUppercase => serializer.serialize_str("AllUppercase"), + Self::Titlecase => serializer.serialize_str("Titlecase"), + Self::Mixed => serializer.serialize_str("Mixed"), + } + } + } + + instrament! { + #[rstest] + fn test_word_casing_from_string( + #[values( + "hello", + "bItTe", + "dANKE", + "übel", + "uebel", + "😀", + "ßuper", + "ẞuperduper", + "WOW!!", + "SCREAMING", + "ẞß", + "", + )] + word: String + ) (|data: &TestWordCasingFromString| { + insta::assert_yaml_snapshot!(data.to_string(), WordCasing::try_from(word.as_str())); + } + ) + } +} diff --git a/core/src/stages/mod.rs b/core/src/stages/mod.rs index f2327d91..39eb9548 100644 --- a/core/src/stages/mod.rs +++ b/core/src/stages/mod.rs @@ -1,34 +1,10 @@ #[cfg(feature = "de")] -pub mod german; +mod german; #[cfg(feature = "symbols")] -pub mod symbols; +mod symbols; +/// Tooling (types, traits, ...) around stages. +pub mod tooling; -#[derive(Debug)] -pub struct StageError; - -impl From for std::io::Error { - fn from(_: StageError) -> Self { - std::io::Error::new(std::io::ErrorKind::Other, "Error in text processor.") - } -} - -#[derive(Debug)] -pub struct SubstitutedString(pub String); - -impl From for String { - fn from(s: SubstitutedString) -> Self { - s.0 - } -} - -impl From for SubstitutedString { - fn from(s: String) -> Self { - Self(s) - } -} - -pub type StageResult = Result; - -pub trait Stage: Send + Sync { - fn substitute(&self, input: &str) -> StageResult; -} +pub use german::GermanStage; +pub use symbols::SymbolsStage; +pub use tooling::Stage; diff --git a/core/src/stages/symbols/mod.rs b/core/src/stages/symbols/mod.rs index a8f5d9b3..018a561d 100644 --- a/core/src/stages/symbols/mod.rs +++ b/core/src/stages/symbols/mod.rs @@ -1,9 +1,11 @@ -use super::{Stage, StageResult}; +use super::{tooling::StageResult, Stage}; -#[derive(Clone, Copy)] -pub struct Symbols; +/// Symbols stage, responsible for symbols such as `—` and `→`. +#[derive(Debug, Clone, Copy)] +#[allow(clippy::module_name_repetitions)] +pub struct SymbolsStage; -impl Stage for Symbols { +impl Stage for SymbolsStage { fn substitute(&self, input: &str) -> StageResult { Ok(String::from(input).into()) } diff --git a/core/src/stages/tooling.rs b/core/src/stages/tooling.rs new file mode 100644 index 00000000..d25fdb23 --- /dev/null +++ b/core/src/stages/tooling.rs @@ -0,0 +1,65 @@ +use std::error::Error; + +/// An error that occurred during processing in a stage. +#[derive(Debug, Copy, Clone)] +pub struct StageError; + +impl From for std::io::Error { + fn from(e: StageError) -> Self { + std::io::Error::new(std::io::ErrorKind::Other, e.to_string()) + } +} + +impl Error for StageError {} + +impl std::fmt::Display for StageError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Error in processing stage") + } +} + +/// A string that has been substituted by a stage. +/// +/// This is a +/// [newtype](https://doc.rust-lang.org/rust-by-example/generics/new_types.html), used +/// for increased clarity. +#[derive(Debug)] +pub struct SubstitutedString( + /// The actual string contents. + pub String, +); + +/// Convert a [`SubstitutedString`] into a [`String`]. +/// +/// Convenience method. +impl From for String { + fn from(s: SubstitutedString) -> Self { + s.0 + } +} + +/// Convert a [`String`] into a [`SubstitutedString`]. +/// +/// Convenience method. +impl From for SubstitutedString { + fn from(s: String) -> Self { + Self(s) + } +} + +/// The [`Result`] of a stage: we either [substituted properly][SubstitutedString], or [failed][StageError]. +pub type StageResult = Result; + +/// A stage in the processing pipeline, as initiated by [`crate::apply`]. +/// +/// Stages are the core of the text processing pipeline and can be applied in any order, +/// [any number of times each](https://en.wikipedia.org/wiki/Idempotence) (more than +/// once being wasted work, though). +pub trait Stage: Send + Sync { + /// Substitute text in a given `input` string. + /// + /// # Errors + /// + /// This method can error out if the stage fails to process the input. + fn substitute(&self, input: &str) -> StageResult; +} diff --git a/core/src/util/mod.rs b/core/src/util/mod.rs deleted file mode 100644 index ea304aa2..00000000 --- a/core/src/util/mod.rs +++ /dev/null @@ -1,2 +0,0 @@ -pub mod iteration; -pub mod strings; diff --git a/core/src/util/strings.rs b/core/src/util/strings.rs deleted file mode 100644 index 21176118..00000000 --- a/core/src/util/strings.rs +++ /dev/null @@ -1,90 +0,0 @@ -#[derive(Debug, PartialEq, Eq)] -pub(crate) enum WordCasing { - AllLowercase, - AllUppercase, - Titlecase, - Mixed, -} - -impl TryFrom<&str> for WordCasing { - type Error = &'static str; - - fn try_from(value: &str) -> Result { - if value.is_empty() { - return Err("String is empty"); - } - - let mut has_lowercase = false; - let mut has_uppercase = false; - let mut is_titlecase = true; - - for (i, c) in value.chars().enumerate() { - if c.is_lowercase() { - has_lowercase = true; - - if i == 0 { - is_titlecase = false; - } - } else if c.is_uppercase() { - has_uppercase = true; - - if i != 0 { - is_titlecase = false; - } - } else { - return Err("String contains characters with undecidable casing"); - } - } - - match (is_titlecase, has_lowercase, has_uppercase) { - (true, _, _) => Ok(Self::Titlecase), - (_, true, false) => Ok(Self::AllLowercase), - (_, false, true) => Ok(Self::AllUppercase), - (_, true, true) => Ok(Self::Mixed), - (_, false, false) => unreachable!("Impossible case: any non-empty string has either lower- or uppercase or returned an `Err` early."), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use common::instrament; - use rstest::rstest; - use serde::Serialize; - - impl Serialize for WordCasing { - fn serialize(&self, serializer: S) -> Result { - match self { - Self::AllLowercase => serializer.serialize_str("AllLowercase"), - Self::AllUppercase => serializer.serialize_str("AllUppercase"), - Self::Titlecase => serializer.serialize_str("Titlecase"), - Self::Mixed => serializer.serialize_str("Mixed"), - } - } - } - - instrament! { - #[rstest] - fn test_word_casing_from_string( - #[values( - "hello", - "bItTe", - "dANKE", - "übel", - "uebel", - "😀", - "ßuper", - "ẞuperduper", - "WOW!!", - "SCREAMING", - "ẞß", - "", - )] - word: String - ) (|data: &TestWordCasingFromString| { - insta::assert_yaml_snapshot!(data.to_string(), WordCasing::try_from(word.as_str())); - } - ) - } -}