diff --git a/src/app.rs b/src/app.rs index 54d6625..bd4df15 100644 --- a/src/app.rs +++ b/src/app.rs @@ -214,6 +214,12 @@ pub fn app() -> App<'static, 'static> { .arg(flag_name("SCRIPT")) .arg(flag_chars.clone()) .arg(flag_trie_set.clone()) + .arg(Arg::with_name("enum") + .long("enum") + .help("Emit a single table that maps codepoints to scripts.")) + .arg(Arg::with_name("rust-enum") + .long("rust-enum") + .help("Emit a Rust enum and a table that maps codepoints to scripts.")) .arg(Arg::with_name("include") .long("include") .takes_value(true) diff --git a/src/script.rs b/src/script.rs index 9fd77dd..897c64e 100644 --- a/src/script.rs +++ b/src/script.rs @@ -25,12 +25,21 @@ pub fn command_script(args: ArgMatches) -> Result<()> { } let mut wtr = args.writer("script")?; - wtr.names(by_name.keys().filter(|n| filter.contains(n)))?; - for (name, set) in by_name { - if filter.contains(&name) { - wtr.ranges(&name, &set)?; + if args.is_present("enum") { + wtr.ranges_to_enum(args.name(), &by_name)?; + } else if args.is_present("rust-enum") { + let mut variants = vec!["Unknown"]; + variants.extend(by_name.keys().map(String::as_str)); + wtr.ranges_to_rust_enum(args.name(), &variants, &by_name)?; + } else { + wtr.names(by_name.keys().filter(|n| filter.contains(n)))?; + for (name, set) in by_name { + if filter.contains(&name) { + wtr.ranges(&name, &set)?; + } } } + Ok(()) } diff --git a/src/util.rs b/src/util.rs index 42961ee..e50954b 100644 --- a/src/util.rs +++ b/src/util.rs @@ -222,10 +222,11 @@ pub fn range_add(ranges: &mut Vec<(u32, u32)>, codepoint: u32) { /// ranges. /// /// This panics if the same codepoint is present multiple times. -pub fn to_range_values(it: I) -> Vec<(u32, u32, u64)> - where I: IntoIterator +pub fn to_range_values(it: I) -> Vec<(u32, u32, V)> + where I: IntoIterator, + V: Ord { - let mut codepoints: Vec<(u32, u64)> = it.into_iter().collect(); + let mut codepoints: Vec<(u32, V)> = it.into_iter().collect(); codepoints.sort(); codepoints.dedup(); @@ -244,14 +245,14 @@ pub fn to_range_values(it: I) -> Vec<(u32, u32, u64)> /// /// This panics if the given codepoint is already in the ranges or if a /// codepoint is given out of order. -pub fn range_value_add( - ranges: &mut Vec<(u32, u32, u64)>, +pub fn range_value_add( + ranges: &mut Vec<(u32, u32, V)>, codepoint: u32, - value: u64, + value: V, ) { - if let Some(&mut (_, ref mut end, value2)) = ranges.last_mut() { + if let Some(&mut (_, ref mut end, ref value2)) = ranges.last_mut() { assert!(*end < codepoint); - if codepoint == *end + 1 && value == value2 { + if codepoint == *end + 1 && &value == value2 { *end = codepoint; return; } diff --git a/src/writer.rs b/src/writer.rs index e309488..cbae1a3 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -1,6 +1,7 @@ use std::char; use std::collections::{BTreeMap, BTreeSet}; use std::env; +use std::fmt; use std::fs::File; use std::io::{self, Write}; use std::mem::size_of; @@ -284,6 +285,64 @@ impl Writer { Ok(()) } + /// Write a map that associates codepoint ranges to a single value in a + /// Rust enum. + /// + /// The given map should be a map from the enum variant value to the set + /// of codepoints that have that value. + pub fn ranges_to_rust_enum( + &mut self, + name: &str, + variants: &[&str], + enum_map: &BTreeMap>, + ) -> Result<()> { + self.header()?; + self.separator()?; + + writeln!(self.wtr, "#[derive(Clone, Copy, Debug, Hash, Eq, PartialEq)]")?; + let enum_name = rust_type_name(name); + writeln!(self.wtr, "pub enum {} {{", enum_name)?; + for variant in variants { + self.wtr.write_str(&format!("{}, ", rust_type_name(variant)))?; + } + writeln!(self.wtr, "}}\n")?; + + let mut map = BTreeMap::new(); + for (variant, ref set) in enum_map.iter() { + map.extend(set.iter().cloned().map(|cp| (cp, variant))); + } + let ranges = util::to_range_values( + map.iter().map(|(&k, &v)| (k, rust_type_name(v)))); + self.ranges_to_enum_slice(name, &enum_name, &ranges)?; + self.wtr.flush()?; + Ok(()) + } + + fn ranges_to_enum_slice( + &mut self, + name: &str, + enum_ty: &str, + table: &[(u32, u32, S)], + ) -> Result<()> + where S: fmt::Display + { + let cp_ty = self.rust_codepoint_type(); + + writeln!( + self.wtr, + "pub const {}: &'static [({}, {}, {})] = &[", + name, cp_ty, cp_ty, enum_ty)?; + for (start, end, variant) in table { + let range = (self.rust_codepoint(*start), self.rust_codepoint(*end)); + if let (Some(start), Some(end)) = range { + let src = format!("({}, {}, {}::{}), ", start, end, enum_ty, variant); + self.wtr.write_str(&src)?; + } + } + writeln!(self.wtr, "];")?; + Ok(()) + } + /// Write a map that associates ranges of codepoints with an arbitrary /// integer. /// @@ -1053,6 +1112,21 @@ fn rust_const_name(s: &str) -> String { s } +/// Heuristically produce an appropriate Rust type name. +fn rust_type_name(s: &str) -> String { + // Convert to PascalCase + s.split(|c: char| c.is_whitespace() || c == '.' || c == '_' || c == '-') + .map(|component| { + // Upper first char + let lower = component.to_ascii_lowercase(); + let mut chars = lower.chars(); + match chars.next() { + None => String::new(), + Some(f) => f.to_uppercase().collect::() + chars.as_str(), + } + }).collect() +} + /// Heuristically produce an appropriate module Rust name. fn rust_module_name(s: &str) -> String { // Property names/values seem pretty uniform, particularly the @@ -1118,7 +1192,7 @@ fn smallest_unsigned_type(n: u64) -> &'static str { #[cfg(test)] mod tests { - use super::pack_str; + use super::{pack_str, rust_type_name}; fn unpack_str(mut encoded: u64) -> String { let mut value = String::new(); @@ -1140,4 +1214,13 @@ mod tests { assert!(pack_str("ABCDEFGHI").is_err()); assert!(pack_str("AB\x00CD").is_err()); } + + #[test] + fn test_rust_type_name() { + assert_eq!(&rust_type_name("SCRIPT"), "Script"); + assert_eq!(&rust_type_name("dot.separated"), "DotSeparated"); + assert_eq!(&rust_type_name("dash-separated"), "DashSeparated"); + assert_eq!(&rust_type_name("white \tspace"), "WhiteSpace"); + assert_eq!(&rust_type_name("snake_case"), "SnakeCase"); + } }