Skip to content

Commit

Permalink
ucd-util: require callers to pass in JAMO_SHORT_NAME table
Browse files Browse the repository at this point in the history
This breaks a dependency where `ucd-util` depended on running
ucd-generate to produce the JAMO_SHORT_NAME table. Instead, we now
require the caller to provide the table.

Fixes #11
  • Loading branch information
BurntSushi committed Jul 7, 2023
1 parent cf7f4f0 commit 588272c
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 22 deletions.
3 changes: 0 additions & 3 deletions scripts/generate-unicode-tables
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,6 @@ echo "generating tables for ucd-util tests"
out="ucd-util/src/unicode_tables"
ucd-generate property-names "$ucddir" > "$out/property_names.rs"
ucd-generate property-values "$ucddir" > "$out/property_values.rs"

echo "generating small JAMO_SHORT_NAME table for ucd-util"
out="ucd-util/src/unicode_tables"
ucd-generate jamo-short-name "$ucddir" > "$out/jamo_short_name.rs"

cargo +stable fmt
22 changes: 17 additions & 5 deletions src/jamo_short_name.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::collections::BTreeMap;
use std::{collections::BTreeMap, path::Path};

use ucd_parse::{self, JamoShortName};

Expand All @@ -7,13 +7,25 @@ use crate::error::Result;

pub fn command(args: ArgMatches<'_>) -> Result<()> {
let dir = args.ucd_dir()?;
let jamo_map = ucd_parse::parse_by_codepoint::<_, JamoShortName>(dir)?;

let map = jamo_map(&Path::new(dir))?;
let mut wtr = args.writer("jamo_short_name")?;
wtr.codepoint_to_string(args.name(), &map)?;
Ok(())
}

fn jamo_map(dir: &Path) -> Result<BTreeMap<u32, String>> {
let jamo_map = ucd_parse::parse_by_codepoint::<_, JamoShortName>(dir)?;
let mut map = BTreeMap::new();
for (cp, jamo) in jamo_map {
map.insert(cp.value(), jamo.name);
}
wtr.codepoint_to_string(args.name(), &map)?;
Ok(())
Ok(map)
}

/// Builds the Jamo short name table for the UCD directory `dir` as a vector
/// of `(codepoint, short name)` pairs.
///
/// The pairs are collected from the `BTreeMap` returned by `jamo_map`, so
/// they come out in ascending codepoint order. Any error from `jamo_map` is
/// returned to the caller unchanged.
pub fn table(dir: &Path) -> Result<Vec<(u32, String)>> {
    let by_codepoint = jamo_map(dir)?;
    let pairs: Vec<(u32, String)> = by_codepoint.into_iter().collect();
    Ok(pairs)
}

/// Produces a borrowed view of an owned Jamo short name table.
///
/// Each `(codepoint, String)` entry becomes a `(codepoint, &str)` entry that
/// borrows from `table`; the order of entries is preserved.
pub fn table_ref<'a>(table: &'a [(u32, String)]) -> Vec<(u32, &'a str)> {
    let mut borrowed = Vec::with_capacity(table.len());
    for (cp, name) in table {
        borrowed.push((*cp, name.as_str()));
    }
    borrowed
}
10 changes: 8 additions & 2 deletions src/names.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::collections::BTreeMap;
use std::{collections::BTreeMap, path::Path};

use ucd_parse::{self, Codepoint, NameAlias, UnicodeData};
use ucd_util;
Expand All @@ -8,6 +8,7 @@ use crate::error::Result;

pub fn command(args: ArgMatches<'_>) -> Result<()> {
let dir = args.ucd_dir()?;
let jamo_short_name_map = crate::jamo_short_name::table(Path::new(dir))?;
let data = ucd_parse::parse_by_codepoint(&dir)?;
let aliases = if args.is_present("no-aliases") {
None
Expand All @@ -17,6 +18,7 @@ pub fn command(args: ArgMatches<'_>) -> Result<()> {
let mut names = names_to_codepoint(
&data,
&aliases,
&crate::jamo_short_name::table_ref(&jamo_short_name_map),
!args.is_present("no-ideograph"),
!args.is_present("no-hangul"),
);
Expand Down Expand Up @@ -83,6 +85,7 @@ impl NameTag {
fn names_to_codepoint(
data: &BTreeMap<Codepoint, UnicodeData>,
aliases: &Option<BTreeMap<Codepoint, Vec<NameAlias>>>,
jamo_short_name_table: &[(u32, &str)],
ideograph: bool,
hangul: bool,
) -> BTreeMap<String, (NameTag, u32)> {
Expand Down Expand Up @@ -125,7 +128,10 @@ fn names_to_codepoint(
for &(start, end) in ucd_util::RANGE_HANGUL_SYLLABLE {
for cp in start..end + 1 {
let v = (NameTag::Hangul, cp);
map.insert(ucd_util::hangul_name(cp).unwrap(), v);
map.insert(
ucd_util::hangul_name(jamo_short_name_table, cp).unwrap(),
v,
);
}
}
}
Expand Down
36 changes: 24 additions & 12 deletions ucd-util/src/hangul.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use crate::unicode_tables::jamo_short_name::JAMO_SHORT_NAME;

// This implementation should correspond to the algorithms described in
// Unicode 3.12.

Expand All @@ -22,16 +20,23 @@ const N_COUNT: u32 = 588;
/// codepoint in the inclusive range `AC00..D7A3`, then this returns `None`.
///
/// This implements the algorithms described in Unicode 3.12 and Unicode 4.8.
pub fn hangul_name(cp: u32) -> Option<String> {
///
/// The `table` given should be a map from codepoint to the corresponding
/// Jamo short name for that codepoint. If you're using `ucd-generate`, then
/// the table can be generated via the `jamo-short-name` sub-command.
pub fn hangul_name<'a>(
table: &'a [(u32, &'a str)],
cp: u32,
) -> Option<String> {
let mut name = "HANGUL SYLLABLE ".to_string();
let (lpart, vpart, tpart) = match hangul_full_canonical_decomposition(cp) {
None => return None,
Some(triple) => triple,
};

name.push_str(jamo_short_name(lpart));
name.push_str(jamo_short_name(vpart));
name.push_str(tpart.map_or("", jamo_short_name));
name.push_str(jamo_short_name(table, lpart));
name.push_str(jamo_short_name(table, vpart));
name.push_str(tpart.map_or("", |cp| jamo_short_name(table, cp)));
Some(name)
}

Expand Down Expand Up @@ -63,13 +68,17 @@ pub fn hangul_full_canonical_decomposition(
Some((l_part, v_part, t_part))
}

fn jamo_short_name(cp: u32) -> &'static str {
let i = JAMO_SHORT_NAME.binary_search_by_key(&cp, |p| p.0).unwrap();
JAMO_SHORT_NAME[i].1
/// A borrowed Jamo short name table: `(codepoint, short name)` pairs.
type JamoShortName<'a> = &'a [(u32, &'a str)];

/// Returns the short name recorded for `cp` in `table`.
///
/// The lookup is a binary search on the codepoint, so `table` must be sorted
/// by codepoint in ascending order.
///
/// # Panics
///
/// Panics if the search finds no entry for `cp`.
fn jamo_short_name<'a>(table: JamoShortName<'a>, cp: u32) -> &'a str {
    let pos = table
        .binary_search_by(|&(codepoint, _)| codepoint.cmp(&cp))
        .unwrap();
    let (_, short_name) = table[pos];
    short_name
}

#[cfg(test)]
mod tests {
use crate::unicode_tables::jamo_short_name::JAMO_SHORT_NAME as TABLE;

use super::{hangul_full_canonical_decomposition, hangul_name};

#[test]
Expand All @@ -82,18 +91,21 @@ mod tests {

#[test]
fn name() {
assert_eq!(hangul_name(0xD4DB).unwrap(), "HANGUL SYLLABLE PWILH");
assert_eq!(
hangul_name(TABLE, 0xD4DB).unwrap(),
"HANGUL SYLLABLE PWILH"
);
}

#[test]
fn all() {
for cp in 0xAC00..(0xD7A3 + 1) {
hangul_name(cp).unwrap();
hangul_name(TABLE, cp).unwrap();
}
}

#[test]
fn invalid() {
assert!(hangul_name(0).is_none());
assert!(hangul_name(TABLE, 0).is_none());
}
}
1 change: 1 addition & 0 deletions ucd-util/src/unicode_tables/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#[cfg(test)]
pub mod jamo_short_name;
#[cfg(test)]
pub mod property_names;
Expand Down

0 comments on commit 588272c

Please sign in to comment.