From 588272c9ef526d4ba87c690adf424c610b933ed5 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 6 Jul 2023 11:39:27 -0400 Subject: [PATCH] ucd-util: require callers to pass in JAMO_SHORT_NAME table This breaks a dependency where `ucd-util` depended on running ucd-generate to produce the JAMO_SHORT_NAME table. Instead, we now require the caller to provide the table. Fixes #11 --- scripts/generate-unicode-tables | 3 --- src/jamo_short_name.rs | 22 +++++++++++++----- src/names.rs | 10 +++++++-- ucd-util/src/hangul.rs | 36 ++++++++++++++++++++---------- ucd-util/src/unicode_tables/mod.rs | 1 + 5 files changed, 50 insertions(+), 22 deletions(-) diff --git a/scripts/generate-unicode-tables b/scripts/generate-unicode-tables index 2fb32da..ea5b101 100755 --- a/scripts/generate-unicode-tables +++ b/scripts/generate-unicode-tables @@ -56,9 +56,6 @@ echo "generating tables for ucd-util tests" out="ucd-util/src/unicode_tables" ucd-generate property-names "$ucddir" > "$out/property_names.rs" ucd-generate property-values "$ucddir" > "$out/property_values.rs" - -echo "generating small JAMO_SHORT_NAME table for ucd-util" -out="ucd-util/src/unicode_tables" ucd-generate jamo-short-name "$ucddir" > "$out/jamo_short_name.rs" cargo +stable fmt diff --git a/src/jamo_short_name.rs b/src/jamo_short_name.rs index e802a34..eb07897 100644 --- a/src/jamo_short_name.rs +++ b/src/jamo_short_name.rs @@ -1,4 +1,4 @@ -use std::collections::BTreeMap; +use std::{collections::BTreeMap, path::Path}; use ucd_parse::{self, JamoShortName}; @@ -7,13 +7,25 @@ use crate::error::Result; pub fn command(args: ArgMatches<'_>) -> Result<()> { let dir = args.ucd_dir()?; - let jamo_map = ucd_parse::parse_by_codepoint::<_, JamoShortName>(dir)?; - + let map = jamo_map(&Path::new(dir))?; let mut wtr = args.writer("jamo_short_name")?; + wtr.codepoint_to_string(args.name(), &map)?; + Ok(()) +} + +fn jamo_map(dir: &Path) -> Result> { + let jamo_map = ucd_parse::parse_by_codepoint::<_, JamoShortName>(dir)?; let 
mut map = BTreeMap::new(); for (cp, jamo) in jamo_map { map.insert(cp.value(), jamo.name); } - wtr.codepoint_to_string(args.name(), &map)?; - Ok(()) + Ok(map) +} + +pub fn table(dir: &Path) -> Result> { + Ok(jamo_map(dir)?.into_iter().collect()) +} + +pub fn table_ref<'a>(table: &'a [(u32, String)]) -> Vec<(u32, &'a str)> { + table.iter().map(|&(cp, ref name)| (cp, &**name)).collect() } diff --git a/src/names.rs b/src/names.rs index 3fbd213..3715023 100644 --- a/src/names.rs +++ b/src/names.rs @@ -1,4 +1,4 @@ -use std::collections::BTreeMap; +use std::{collections::BTreeMap, path::Path}; use ucd_parse::{self, Codepoint, NameAlias, UnicodeData}; use ucd_util; @@ -8,6 +8,7 @@ use crate::error::Result; pub fn command(args: ArgMatches<'_>) -> Result<()> { let dir = args.ucd_dir()?; + let jamo_short_name_map = crate::jamo_short_name::table(Path::new(dir))?; let data = ucd_parse::parse_by_codepoint(&dir)?; let aliases = if args.is_present("no-aliases") { None @@ -17,6 +18,7 @@ pub fn command(args: ArgMatches<'_>) -> Result<()> { let mut names = names_to_codepoint( &data, &aliases, + &crate::jamo_short_name::table_ref(&jamo_short_name_map), !args.is_present("no-ideograph"), !args.is_present("no-hangul"), ); @@ -83,6 +85,7 @@ impl NameTag { fn names_to_codepoint( data: &BTreeMap, aliases: &Option>>, + jamo_short_name_table: &[(u32, &str)], ideograph: bool, hangul: bool, ) -> BTreeMap { @@ -125,7 +128,10 @@ fn names_to_codepoint( for &(start, end) in ucd_util::RANGE_HANGUL_SYLLABLE { for cp in start..end + 1 { let v = (NameTag::Hangul, cp); - map.insert(ucd_util::hangul_name(cp).unwrap(), v); + map.insert( + ucd_util::hangul_name(jamo_short_name_table, cp).unwrap(), + v, + ); } } } diff --git a/ucd-util/src/hangul.rs b/ucd-util/src/hangul.rs index 53f61fb..7d0e0f0 100644 --- a/ucd-util/src/hangul.rs +++ b/ucd-util/src/hangul.rs @@ -1,5 +1,3 @@ -use crate::unicode_tables::jamo_short_name::JAMO_SHORT_NAME; - // This implementation should correspond to the algorithms described 
in // Unicode 3.12. @@ -22,16 +20,23 @@ const N_COUNT: u32 = 588; /// codepoint in the inclusive range `AC00..D7A3`, then this returns `None`. /// /// This implements the algorithms described in Unicode 3.12 and Unicode 4.8. -pub fn hangul_name(cp: u32) -> Option { +/// +/// The `table` given should be a map from codepoint to the corresponding +/// Jamo short name for that codepoint. If you're using `ucd-generate`, then +/// the table can be generated via the `jamo-short-name` sub-command. +pub fn hangul_name<'a>( + table: &'a [(u32, &'a str)], + cp: u32, +) -> Option { let mut name = "HANGUL SYLLABLE ".to_string(); let (lpart, vpart, tpart) = match hangul_full_canonical_decomposition(cp) { None => return None, Some(triple) => triple, }; - name.push_str(jamo_short_name(lpart)); - name.push_str(jamo_short_name(vpart)); - name.push_str(tpart.map_or("", jamo_short_name)); + name.push_str(jamo_short_name(table, lpart)); + name.push_str(jamo_short_name(table, vpart)); + name.push_str(tpart.map_or("", |cp| jamo_short_name(table, cp))); Some(name) } @@ -63,13 +68,17 @@ pub fn hangul_full_canonical_decomposition( Some((l_part, v_part, t_part)) } -fn jamo_short_name(cp: u32) -> &'static str { - let i = JAMO_SHORT_NAME.binary_search_by_key(&cp, |p| p.0).unwrap(); - JAMO_SHORT_NAME[i].1 +type JamoShortName<'a> = &'a [(u32, &'a str)]; + +fn jamo_short_name<'a>(table: JamoShortName<'a>, cp: u32) -> &'a str { + let i = table.binary_search_by_key(&cp, |p| p.0).unwrap(); + table[i].1 } #[cfg(test)] mod tests { + use crate::unicode_tables::jamo_short_name::JAMO_SHORT_NAME as TABLE; + use super::{hangul_full_canonical_decomposition, hangul_name}; #[test] @@ -82,18 +91,21 @@ mod tests { #[test] fn name() { - assert_eq!(hangul_name(0xD4DB).unwrap(), "HANGUL SYLLABLE PWILH"); + assert_eq!( + hangul_name(TABLE, 0xD4DB).unwrap(), + "HANGUL SYLLABLE PWILH" + ); } #[test] fn all() { for cp in 0xAC00..(0xD7A3 + 1) { - hangul_name(cp).unwrap(); + hangul_name(TABLE, cp).unwrap(); } } #[test] 
fn invalid() { - assert!(hangul_name(0).is_none()); + assert!(hangul_name(TABLE, 0).is_none()); } } diff --git a/ucd-util/src/unicode_tables/mod.rs b/ucd-util/src/unicode_tables/mod.rs index 558177d..40269e4 100644 --- a/ucd-util/src/unicode_tables/mod.rs +++ b/ucd-util/src/unicode_tables/mod.rs @@ -1,3 +1,4 @@ +#[cfg(test)] pub mod jamo_short_name; #[cfg(test)] pub mod property_names;