Skip to content

Commit

Permalink
Implement normalization for combining diacritics
Browse files Browse the repository at this point in the history
  • Loading branch information
Yaulendil committed Nov 12, 2022
1 parent 529b978 commit d2b423a
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 3 deletions.
25 changes: 25 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ panic = "abort"
[dependencies]
cfg-if = "1.0.0"
itertools = "0.10.0"
unicode-normalization = "0.1.22"

[dependencies.clap]
version = "4.0.18"
Expand Down
2 changes: 1 addition & 1 deletion TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

### CLI
### Input
- [ ] Normalize diacritics in input
- [x] Normalize diacritics in input
- [ ] Support (duo)decimal points OR drop generic in `Numeral`
### Library
- [ ] Redo ZWJ ligature cases
Expand Down
4 changes: 2 additions & 2 deletions src/mode/iter.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use unicode_normalization::UnicodeNormalization;
use crate::{characters::punctuation, Token, Transcriber};
use super::{ParseAction, TengwarMode};

Expand Down Expand Up @@ -76,8 +77,7 @@ impl<M: TengwarMode> Tokenizer<M> {

/// Set up a new Tokenizer over UTF-8 text.
///
/// Accepts anything that can be borrowed as a `str`, gathers its characters
/// with `collect()`, and passes the result to [`Self::new`].
pub fn from_str(s: impl AsRef<str>) -> Self {
// NOTE(review): this listing is a diff with its +/- markers stripped. The
// page reports 2 additions and 2 deletions for this file: the TODO and the
// `.chars()` line below are the removed version, and the `.nfc()` line
// (plus the `unicode_normalization` import) is the replacement.
// TODO: Normalize combining diacritics into single `char`s.
Self::new(s.as_ref().chars().collect())
// `nfc()` comes from `unicode_normalization::UnicodeNormalization` and
// yields the text in Normalization Form C, so a base letter followed by a
// combining mark is composed into one `char` wherever a composed form exists.
Self::new(s.as_ref().nfc().collect())
}

/// Wrap this [`Tokenizer`] in a [`Transcriber`] that can apply higher-level
Expand Down
25 changes: 25 additions & 0 deletions src/mode/tests/quenya.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,31 @@ fn ligatures() {
}


/// Checks that Quenya input written with combining diacritics is transcribed
/// the same way as input written with standard codepoints, in both lower and
/// upper case.
///
/// NOTE(review): `test_tengwar!` is defined outside this view. From its use
/// here, the `=>` form appears to check a transcription against the listed
/// tokens and return it, and the `==` form appears to compare a second input
/// against that earlier result — confirm against the macro definition.
///
/// The string literals below differ only in their codepoint sequences; keep
/// them byte-for-byte intact when editing, or the test stops exercising
/// normalization.
#[test]
fn normalization() {
// Standard codepoints.
let ngaavea = test_tengwar!(Quenya, "ñávëa" => [
TENGWA_NOLDO, // ñ
CARRIER_LONG, TEHTA_A.base, // á
TENGWA_VALA, TEHTA_E.base, // vë
CARRIER_SHORT, TEHTA_A.base, // a
]);
// Combining diacritic equivalents.
test_tengwar!(Quenya, "ñávëa" == ngaavea);
test_tengwar!(Quenya, "ÑÁVËA" == ngaavea);

// Standard codepoints.
let noole = test_tengwar!(Quenya, "ñólë" => [
TENGWA_NOLDO, TEHTA_O.base, TEHTA_O.base, // ñó
TENGWA_LAMBE, TEHTA_E.base, // lë
]);
// Combining diacritic equivalents.
// Two different combining marks are exercised for the acute accent; both
// are expected to produce the same result as the standard `ó`.
test_tengwar!(Quenya, "ñólë" == noole); // Acute accent (U+0301).
test_tengwar!(Quenya, "ñólë" == noole); // Acute tone mark (U+0341).
test_tengwar!(Quenya, "ÑÓLË" == noole);
}


#[test]
fn nuquernar() {
// Check Silmë.
Expand Down

0 comments on commit d2b423a

Please sign in to comment.