cli: remove the 'dfa' and 'regex' sub-commands

These have been moved to regex-cli and now use regex-automata 0.3:
https://github.com/rust-lang/regex/blob/master/regex-cli/README.md#example-serialize-a-dfa

This also breaks the cyclic dependency where updating to a new Unicode
version for bstr required the following:

* Run ucd-generate to update regex-syntax tables.
* Publish new regex-syntax.
* Update ucd-generate lockfile to bring in new regex-syntax.
* Build new ucd-generate binary.
* Run ucd-generate to update bstr regexes.

Namely, that last step requires updating regex-syntax in order to
propagate the Unicode updates into the regex engine.

The new process is:

* Run ucd-generate to update regex-syntax tables.
* Build regex-cli (also in the regex crate repo).
* Run regex-cli to update bstr regexes.

So now we don't have to do this weird dance where we loop back around to
build a new version of ucd-generate.

ucd-generate does still depend on `regex` at the moment via `ucd-parse`,
but this doesn't need updating when a new version of Unicode comes out.
Still, I'm going to explore breaking that dependency as well via
`regex-lite`.

ucd-generate also still depends on `ucd-util` which also has Unicode data
embedded into it. I'm going to look into fixing that by requiring the
caller to pass in the data tables.

Fixes #11
BurntSushi · Jul 7, 2023 · cf7f4f0 · cf7f4f0
1 parent 59ccef4
commit cf7f4f0
Show file tree

Hide file tree

Showing 9 changed files with 30 additions and 544 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -24,9 +24,7 @@ path = "src/main.rs"
 name = "ucd-generate"
 
 [dependencies]
-byteorder = "1"
 fst = "0.4.0"
-regex-automata = "0.1.9"
 ucd-parse = { version = "0.1.10", path = "ucd-parse" }
 ucd-trie = { version = "0.1.5", path = "ucd-trie" }
 ucd-util = { version = "0.1.9", path = "ucd-util"  }

diff --git a/benches/bench.rs b/benches/bench.rs
@@ -4,15 +4,12 @@ extern crate test;
 
 use std::cmp::Ordering;
 
-use byteorder::{BigEndian as BE, ByteOrder};
 use test::Bencher;
 
 mod tables;
 
 fn u32_key(cp: u32) -> [u8; 4] {
-    let mut key = [0; 4];
-    BE::write_u32(&mut key, cp);
-    key
+    cp.to_be_bytes()
 }
 
 #[bench]

diff --git a/src/app.rs b/src/app.rs
@@ -163,18 +163,6 @@ sentence-break emits the table of property values and their corresponding
 codepoints for the Sentence_Break property.
 ";
 
-const ABOUT_DFA: &'static str = "\
-dfa emits a single serialized DFAs from an arbitrary regular expression. If
-you want a regular expression for finding the start and end of a match, then
-use the 'regex' sub-command. Otherwise, if you only care about the end of a
-match (forward DFA, the default) or the start of a match (reverse DFA), then
-only a single DFA is necessary.
-";
-
-const ABOUT_REGEX: &'static str = "\
-regex emits serialized DFAs from arbitrary regular expressions.
-";
-
 /// Build a clap application.
 pub fn app() -> App<'static, 'static> {
     // Various common flags and arguments.
@@ -637,50 +625,6 @@ pub fn app() -> App<'static, 'static> {
                 .long("enum")
                 .help("Emit a single table that maps codepoints to values."),
         );
-    let cmd_dfa = SubCommand::with_name("dfa")
-        .author(clap::crate_authors!())
-        .version(clap::crate_version!())
-        .template(TEMPLATE_SUB)
-        .about("Serialize a single DFAs")
-        .before_help(ABOUT_DFA)
-        .arg(Arg::with_name("dfa-dir").help("Emit DFAs to this directory"))
-        .arg(Arg::with_name("pattern"))
-        .arg(flag_name("DFA"))
-        .arg(Arg::with_name("sparse").long("sparse"))
-        .arg(Arg::with_name("anchored").long("anchored"))
-        .arg(Arg::with_name("minimize").long("minimize"))
-        .arg(Arg::with_name("classes").long("classes"))
-        .arg(Arg::with_name("premultiply").long("premultiply"))
-        .arg(Arg::with_name("no-utf8").long("no-utf8"))
-        .arg(Arg::with_name("longest").long("longest"))
-        .arg(Arg::with_name("reverse").long("reverse"))
-        .arg(
-            Arg::with_name("state-size")
-                .long("state-size")
-                .possible_values(&["1", "2", "4", "8"])
-                .default_value("4"),
-        );
-    let cmd_regex = SubCommand::with_name("regex")
-        .author(clap::crate_authors!())
-        .version(clap::crate_version!())
-        .template(TEMPLATE_SUB)
-        .about("Serialize regular expression DFAs.")
-        .before_help(ABOUT_REGEX)
-        .arg(Arg::with_name("dfa-dir").help("Emit DFAs to this directory"))
-        .arg(Arg::with_name("pattern"))
-        .arg(flag_name("REGEX"))
-        .arg(Arg::with_name("sparse").long("sparse"))
-        .arg(Arg::with_name("anchored").long("anchored"))
-        .arg(Arg::with_name("minimize").long("minimize"))
-        .arg(Arg::with_name("classes").long("classes"))
-        .arg(Arg::with_name("premultiply").long("premultiply"))
-        .arg(Arg::with_name("no-utf8").long("no-utf8"))
-        .arg(
-            Arg::with_name("state-size")
-                .long("state-size")
-                .possible_values(&["1", "2", "4", "8"])
-                .default_value("4"),
-        );
 
     let cmd_test_unicode_data = SubCommand::with_name("test-unicode-data")
         .author(clap::crate_authors!())
@@ -717,7 +661,5 @@ pub fn app() -> App<'static, 'static> {
         .subcommand(cmd_grapheme_cluster_break)
         .subcommand(cmd_word_break)
         .subcommand(cmd_sentence_break)
-        .subcommand(cmd_dfa)
-        .subcommand(cmd_regex)
         .subcommand(cmd_test_unicode_data)
 }
diff --git a/src/args.rs b/src/args.rs
@@ -35,9 +35,6 @@ impl<'a> ArgMatches<'a> {
             .columns(79)
             .char_literals(self.is_present("chars"))
             .trie_set(self.is_present("trie-set"));
-        if let Some(p) = self.value_of_os("dfa-dir") {
-            return builder.from_dfa_dir(p);
-        }
         // Some of the functionality of this crate works with a partial ucd
         // directory.
         match ucd_parse::ucd_directory_version(self.ucd_dir()?) {
@@ -52,19 +49,6 @@ impl<'a> ArgMatches<'a> {
         }
     }
 
-    pub fn dfa_writer(&self, name: &str) -> Result<Writer> {
-        let mut builder = WriterBuilder::new(name);
-        builder
-            .columns(79)
-            .char_literals(self.is_present("chars"))
-            .trie_set(self.is_present("trie-set"));
-        if let Some(p) = self.value_of_os("dfa-dir") {
-            builder.from_dfa_dir(p)
-        } else {
-            err!("missing DFA directory")
-        }
-    }
-
     pub fn name(&self) -> &str {
         self.value_of("name").expect("the name of the table")
     }

diff --git a/src/error.rs b/src/error.rs
@@ -75,9 +75,3 @@ impl From<ucd_trie::Error> for Error {
         Error::Other(err.to_string())
     }
 }
-
-impl From<regex_automata::Error> for Error {
-    fn from(err: regex_automata::Error) -> Error {
-        Error::Other(err.to_string())
-    }
-}
diff --git a/src/main.rs b/src/main.rs
@@ -30,7 +30,6 @@ mod jamo_short_name;
 mod joining_type;
 mod names;
 mod property_bool;
-mod regex;
 mod script;
 
 fn main() {
@@ -85,8 +84,6 @@ fn run() -> Result<()> {
         }
         ("word-break", Some(m)) => brk::word(ArgMatches::new(m)),
         ("sentence-break", Some(m)) => brk::sentence(ArgMatches::new(m)),
-        ("dfa", Some(m)) => regex::command_dfa(ArgMatches::new(m)),
-        ("regex", Some(m)) => regex::command_regex(ArgMatches::new(m)),
         ("test-unicode-data", Some(m)) => {
             cmd_test_unicode_data(ArgMatches::new(m))
         }