Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new option to generate a Rust enum for script #15

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/app.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,12 @@ pub fn app() -> App<'static, 'static> {
.arg(flag_name("SCRIPT"))
.arg(flag_chars.clone())
.arg(flag_trie_set.clone())
.arg(Arg::with_name("enum")
.long("enum")
.help("Emit a single table that maps codepoints to scripts."))
.arg(Arg::with_name("rust-enum")
.long("rust-enum")
.help("Emit a Rust enum and a table that maps codepoints to scripts."))
.arg(Arg::with_name("include")
.long("include")
.takes_value(true)
Expand Down
17 changes: 13 additions & 4 deletions src/script.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,21 @@ pub fn command_script(args: ArgMatches) -> Result<()> {
}

let mut wtr = args.writer("script")?;
wtr.names(by_name.keys().filter(|n| filter.contains(n)))?;
for (name, set) in by_name {
if filter.contains(&name) {
wtr.ranges(&name, &set)?;
if args.is_present("enum") {
wtr.ranges_to_enum(args.name(), &by_name)?;
} else if args.is_present("rust-enum") {
let mut variants = vec!["Unknown"];
variants.extend(by_name.keys().map(String::as_str));
wtr.ranges_to_rust_enum(args.name(), &variants, &by_name)?;
} else {
wtr.names(by_name.keys().filter(|n| filter.contains(n)))?;
for (name, set) in by_name {
if filter.contains(&name) {
wtr.ranges(&name, &set)?;
}
}
}

Ok(())
}

Expand Down
17 changes: 9 additions & 8 deletions src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -222,10 +222,11 @@ pub fn range_add(ranges: &mut Vec<(u32, u32)>, codepoint: u32) {
/// ranges.
///
/// This panics if the same codepoint is present multiple times.
pub fn to_range_values<I>(it: I) -> Vec<(u32, u32, u64)>
where I: IntoIterator<Item=(u32, u64)>
pub fn to_range_values<I, V>(it: I) -> Vec<(u32, u32, V)>
where I: IntoIterator<Item=(u32, V)>,
V: Ord
{
let mut codepoints: Vec<(u32, u64)> = it.into_iter().collect();
let mut codepoints: Vec<(u32, V)> = it.into_iter().collect();
codepoints.sort();
codepoints.dedup();

Expand All @@ -244,14 +245,14 @@ pub fn to_range_values<I>(it: I) -> Vec<(u32, u32, u64)>
///
/// This panics if the given codepoint is already in the ranges or if a
/// codepoint is given out of order.
pub fn range_value_add(
ranges: &mut Vec<(u32, u32, u64)>,
pub fn range_value_add<V: Eq>(
ranges: &mut Vec<(u32, u32, V)>,
codepoint: u32,
value: u64,
value: V,
) {
if let Some(&mut (_, ref mut end, value2)) = ranges.last_mut() {
if let Some(&mut (_, ref mut end, ref value2)) = ranges.last_mut() {
assert!(*end < codepoint);
if codepoint == *end + 1 && value == value2 {
if codepoint == *end + 1 && &value == value2 {
*end = codepoint;
return;
}
Expand Down
85 changes: 84 additions & 1 deletion src/writer.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::char;
use std::collections::{BTreeMap, BTreeSet};
use std::env;
use std::fmt;
use std::fs::File;
use std::io::{self, Write};
use std::mem::size_of;
Expand Down Expand Up @@ -284,6 +285,64 @@ impl Writer {
Ok(())
}

/// Emit a Rust enum definition followed by a table that maps codepoint
/// ranges to variants of that enum.
///
/// `name` is converted with `rust_type_name` to obtain the enum's type name
/// and is also forwarded unchanged to `ranges_to_enum_slice` as the table
/// name. `variants` lists the enum variants in the order they are declared;
/// each one is passed through `rust_type_name` before being written.
///
/// `enum_map` should map each enum variant value to the set of codepoints
/// that have that value. If a codepoint occurs in more than one set, the
/// entry visited last while iterating `enum_map` is the one that is kept.
pub fn ranges_to_rust_enum(
    &mut self,
    name: &str,
    variants: &[&str],
    enum_map: &BTreeMap<String, BTreeSet<u32>>,
) -> Result<()> {
    let enum_name = rust_type_name(name);

    self.header()?;
    self.separator()?;

    // The enum declaration itself.
    writeln!(self.wtr, "#[derive(Clone, Copy, Debug, Hash, Eq, PartialEq)]")?;
    writeln!(self.wtr, "pub enum {} {{", enum_name)?;
    for variant in variants {
        let decl = format!("{}, ", rust_type_name(variant));
        self.wtr.write_str(&decl)?;
    }
    writeln!(self.wtr, "}}\n")?;

    // Invert the variant -> codepoints map into codepoint -> variant so that
    // the codepoints can be visited in ascending order.
    let mut by_codepoint = BTreeMap::new();
    for (variant, codepoints) in enum_map {
        for &cp in codepoints {
            by_codepoint.insert(cp, variant);
        }
    }

    // Collapse the per-codepoint values into ranges and write the table.
    let ranges = util::to_range_values(
        by_codepoint
            .into_iter()
            .map(|(cp, variant)| (cp, rust_type_name(variant))),
    );
    self.ranges_to_enum_slice(name, &enum_name, &ranges)?;
    self.wtr.flush()?;
    Ok(())
}

/// Write a constant slice of `(start, end, variant)` tuples.
///
/// `name` is the name of the emitted constant, `enum_ty` is the name of the
/// enum type used both in the slice's element type and as the path prefix
/// of every variant, and `table` holds the ranges to write. Each `S` value
/// is formatted with `Display` to produce the variant name.
///
/// A range is skipped when `rust_codepoint` returns `None` for either of
/// its end points.
fn ranges_to_enum_slice<S>(
    &mut self,
    name: &str,
    enum_ty: &str,
    table: &[(u32, u32, S)],
) -> Result<()>
where
    S: fmt::Display,
{
    let cp_ty = self.rust_codepoint_type();

    writeln!(
        self.wtr,
        "pub const {}: &'static [({}, {}, {})] = &[",
        name, cp_ty, cp_ty, enum_ty
    )?;
    for &(lo, hi, ref variant) in table {
        // Both end points must be representable for the entry to be written.
        let (start, end) = match (self.rust_codepoint(lo), self.rust_codepoint(hi)) {
            (Some(start), Some(end)) => (start, end),
            _ => continue,
        };
        let entry = format!("({}, {}, {}::{}), ", start, end, enum_ty, variant);
        self.wtr.write_str(&entry)?;
    }
    writeln!(self.wtr, "];")?;
    Ok(())
}

/// Write a map that associates ranges of codepoints with an arbitrary
/// integer.
///
Expand Down Expand Up @@ -1053,6 +1112,21 @@ fn rust_const_name(s: &str) -> String {
s
}

/// Heuristically produce an appropriate Rust type name.
///
/// The input is split on whitespace, `.`, `_` and `-`. Every resulting piece
/// is ASCII-lowercased, has its first character uppercased, and the pieces
/// are concatenated without separators (PascalCase). Empty pieces, such as
/// those between two adjacent separators, contribute nothing.
fn rust_type_name(s: &str) -> String {
    let is_separator =
        |c: char| c.is_whitespace() || c == '.' || c == '_' || c == '-';

    let mut name = String::with_capacity(s.len());
    for piece in s.split(is_separator) {
        let mut rest = piece.chars();
        if let Some(first) = rest.next() {
            // Lowercase first so the result matches lowercasing the whole
            // piece before capitalizing its leading character.
            name.extend(first.to_ascii_lowercase().to_uppercase());
            name.push_str(&rest.as_str().to_ascii_lowercase());
        }
    }
    name
}

/// Heuristically produce an appropriate module Rust name.
fn rust_module_name(s: &str) -> String {
// Property names/values seem pretty uniform, particularly the
Expand Down Expand Up @@ -1118,7 +1192,7 @@ fn smallest_unsigned_type(n: u64) -> &'static str {

#[cfg(test)]
mod tests {
use super::pack_str;
use super::{pack_str, rust_type_name};

fn unpack_str(mut encoded: u64) -> String {
let mut value = String::new();
Expand All @@ -1140,4 +1214,13 @@ mod tests {
assert!(pack_str("ABCDEFGHI").is_err());
assert!(pack_str("AB\x00CD").is_err());
}

#[test]
fn test_rust_type_name() {
    // Each pair is (input, expected type name); together they cover an
    // all-caps word and every separator kind that `rust_type_name` splits on.
    let cases = [
        ("SCRIPT", "Script"),
        ("dot.separated", "DotSeparated"),
        ("dash-separated", "DashSeparated"),
        ("white \tspace", "WhiteSpace"),
        ("snake_case", "SnakeCase"),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(&rust_type_name(input), expected);
    }
}
}