Implement normalization for combining diacritics

Yaulendil · Nov 12, 2022 · d2b423a · d2b423a
1 parent 529b978
commit d2b423a
Show file tree

Hide file tree

Showing 5 changed files with 54 additions and 3 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -25,6 +25,7 @@ panic = "abort"
 [dependencies]
 cfg-if = "1.0.0"
 itertools = "0.10.0"
+unicode-normalization = "0.1.22"
 
 [dependencies.clap]
 version = "4.0.18"

diff --git a/TODO.md b/TODO.md
@@ -5,7 +5,7 @@
 
 ### CLI
 ### Input
-- [ ] Normalize diacritics in input
+- [x] Normalize diacritics in input
 - [ ] Support (duo)decimal points OR drop generic in `Numeral`
 ### Library
 - [ ] Redo ZWJ ligature cases

diff --git a/src/mode/iter.rs b/src/mode/iter.rs
@@ -1,3 +1,4 @@
+use unicode_normalization::UnicodeNormalization;
 use crate::{characters::punctuation, Token, Transcriber};
 use super::{ParseAction, TengwarMode};
 
@@ -76,8 +77,7 @@ impl<M: TengwarMode> Tokenizer<M> {
 
     /// Set up a new Tokenizer over UTF-8 text.
     pub fn from_str(s: impl AsRef<str>) -> Self {
-        //  TODO: Normalize combining diacritics into single `char`s.
-        Self::new(s.as_ref().chars().collect())
+        Self::new(s.as_ref().nfc().collect())
     }
 
     /// Wrap this [`Tokenizer`] in a [`Transcriber`] that can apply higher-level

diff --git a/src/mode/tests/quenya.rs b/src/mode/tests/quenya.rs
@@ -197,6 +197,31 @@ fn ligatures() {
 }
 
 
+#[test]
+fn normalization() {
+    //  Precomposed spelling: each accented letter is a single codepoint.
+    let ngaavea = test_tengwar!(Quenya, "ñávëa" => [
+        TENGWA_NOLDO, // ñ
+        CARRIER_LONG, TEHTA_A.base, // á
+        TENGWA_VALA, TEHTA_E.base, // vë
+        CARRIER_SHORT, TEHTA_A.base, // a
+    ]);
+    //  Equivalent spellings using combining diacritics; each must match `ngaavea`.
+    test_tengwar!(Quenya, "ñávëa" == ngaavea);
+    test_tengwar!(Quenya, "ÑÁVËA" == ngaavea);
+
+    //  Precomposed spelling: each accented letter is a single codepoint.
+    let noole = test_tengwar!(Quenya, "ñólë" => [
+        TENGWA_NOLDO, TEHTA_O.base, TEHTA_O.base, // ñó
+        TENGWA_LAMBE, TEHTA_E.base, // lë
+    ]);
+    //  Equivalent spellings using combining diacritics; each must match `noole`.
+    test_tengwar!(Quenya, "ñólë" == noole); // Acute accent (U+0301).
+    test_tengwar!(Quenya, "ñólë" == noole); // Acute tone mark (U+0341).
+    test_tengwar!(Quenya, "ÑÓLË" == noole);
+}
+
+
 #[test]
 fn nuquernar() {
     //  Check Silmë.