Skip to content

Commit

Permalink
cli: remove the 'dfa' and 'regex' sub-commands
Browse files Browse the repository at this point in the history
These have been moved to regex-cli and now use regex-automata 0.3:
https://github.com/rust-lang/regex/blob/master/regex-cli/README.md#example-serialize-a-dfa

This also breaks the cyclic dependency where updating to a new Unicode
version for bstr required the following:

* Run ucd-generate to update regex-syntax tables.
* Publish new regex-syntax.
* Update ucd-generate lockfile to bring in new regex-syntax.
* Build new ucd-generate binary.
* Run ucd-generate to update bstr regexes.

Namely, that last step requires a ucd-generate binary built against the
updated regex-syntax in order to propagate the Unicode updates into the
regex engine.

The new process is:

* Run ucd-generate to update regex-syntax tables.
* Build regex-cli (also in the regex crate repo).
* Run regex-cli to update bstr regexes.

So now we don't have to do this weird dance where we loop back around to
build a new version of ucd-generate.

ucd-generate does still depend on `regex` at the moment via
`ucd-parse`, but this doesn't need updating when a new version of
Unicode comes out. Still, I'm going to explore breaking that dependency
as well via `regex-lite`.

ucd-generate also still depends on `ucd-util`, which also has Unicode
data embedded into it. I'm going to look into fixing that by requiring
the caller to pass in the data tables.

Fixes #11
  • Loading branch information
BurntSushi committed Jul 7, 2023
1 parent 59ccef4 commit cf7f4f0
Show file tree
Hide file tree
Showing 9 changed files with 30 additions and 544 deletions.
44 changes: 28 additions & 16 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,7 @@ path = "src/main.rs"
name = "ucd-generate"

[dependencies]
byteorder = "1"
fst = "0.4.0"
regex-automata = "0.1.9"
ucd-parse = { version = "0.1.10", path = "ucd-parse" }
ucd-trie = { version = "0.1.5", path = "ucd-trie" }
ucd-util = { version = "0.1.9", path = "ucd-util" }
Expand Down
5 changes: 1 addition & 4 deletions benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,12 @@ extern crate test;

use std::cmp::Ordering;

use byteorder::{BigEndian as BE, ByteOrder};
use test::Bencher;

mod tables;

/// Encodes the codepoint `cp` as a 4-byte big-endian key.
///
/// Uses the standard library's `u32::to_be_bytes`, so no external
/// byte-order crate is needed.
fn u32_key(cp: u32) -> [u8; 4] {
    cp.to_be_bytes()
}

#[bench]
Expand Down
58 changes: 0 additions & 58 deletions src/app.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,18 +163,6 @@ sentence-break emits the table of property values and their corresponding
codepoints for the Sentence_Break property.
";

/// Long-form description of the `dfa` sub-command, passed to clap's
/// `before_help` when that sub-command is built.
const ABOUT_DFA: &'static str = "\
dfa emits a single serialized DFA from an arbitrary regular expression. If
you want a regular expression for finding the start and end of a match, then
use the 'regex' sub-command. Otherwise, if you only care about the end of a
match (forward DFA, the default) or the start of a match (reverse DFA), then
only a single DFA is necessary.
";

/// Long-form description of the `regex` sub-command, passed to clap's
/// `before_help` when that sub-command is built.
const ABOUT_REGEX: &'static str = "\
regex emits serialized DFAs from arbitrary regular expressions.
";

/// Build a clap application.
pub fn app() -> App<'static, 'static> {
// Various common flags and arguments.
Expand Down Expand Up @@ -637,50 +625,6 @@ pub fn app() -> App<'static, 'static> {
.long("enum")
.help("Emit a single table that maps codepoints to values."),
);
let cmd_dfa = SubCommand::with_name("dfa")
.author(clap::crate_authors!())
.version(clap::crate_version!())
.template(TEMPLATE_SUB)
.about("Serialize a single DFAs")
.before_help(ABOUT_DFA)
.arg(Arg::with_name("dfa-dir").help("Emit DFAs to this directory"))
.arg(Arg::with_name("pattern"))
.arg(flag_name("DFA"))
.arg(Arg::with_name("sparse").long("sparse"))
.arg(Arg::with_name("anchored").long("anchored"))
.arg(Arg::with_name("minimize").long("minimize"))
.arg(Arg::with_name("classes").long("classes"))
.arg(Arg::with_name("premultiply").long("premultiply"))
.arg(Arg::with_name("no-utf8").long("no-utf8"))
.arg(Arg::with_name("longest").long("longest"))
.arg(Arg::with_name("reverse").long("reverse"))
.arg(
Arg::with_name("state-size")
.long("state-size")
.possible_values(&["1", "2", "4", "8"])
.default_value("4"),
);
let cmd_regex = SubCommand::with_name("regex")
.author(clap::crate_authors!())
.version(clap::crate_version!())
.template(TEMPLATE_SUB)
.about("Serialize regular expression DFAs.")
.before_help(ABOUT_REGEX)
.arg(Arg::with_name("dfa-dir").help("Emit DFAs to this directory"))
.arg(Arg::with_name("pattern"))
.arg(flag_name("REGEX"))
.arg(Arg::with_name("sparse").long("sparse"))
.arg(Arg::with_name("anchored").long("anchored"))
.arg(Arg::with_name("minimize").long("minimize"))
.arg(Arg::with_name("classes").long("classes"))
.arg(Arg::with_name("premultiply").long("premultiply"))
.arg(Arg::with_name("no-utf8").long("no-utf8"))
.arg(
Arg::with_name("state-size")
.long("state-size")
.possible_values(&["1", "2", "4", "8"])
.default_value("4"),
);

let cmd_test_unicode_data = SubCommand::with_name("test-unicode-data")
.author(clap::crate_authors!())
Expand Down Expand Up @@ -717,7 +661,5 @@ pub fn app() -> App<'static, 'static> {
.subcommand(cmd_grapheme_cluster_break)
.subcommand(cmd_word_break)
.subcommand(cmd_sentence_break)
.subcommand(cmd_dfa)
.subcommand(cmd_regex)
.subcommand(cmd_test_unicode_data)
}
16 changes: 0 additions & 16 deletions src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,6 @@ impl<'a> ArgMatches<'a> {
.columns(79)
.char_literals(self.is_present("chars"))
.trie_set(self.is_present("trie-set"));
if let Some(p) = self.value_of_os("dfa-dir") {
return builder.from_dfa_dir(p);
}
// Some of the functionality of this crate works with a partial ucd
// directory.
match ucd_parse::ucd_directory_version(self.ucd_dir()?) {
Expand All @@ -52,19 +49,6 @@ impl<'a> ArgMatches<'a> {
}
}

/// Builds a `Writer` named `name` that targets the directory given by the
/// `dfa-dir` argument.
///
/// The builder is configured with 79 columns, and its `char_literals` and
/// `trie_set` options mirror the presence of the `chars` and `trie-set`
/// flags respectively.
///
/// Returns an error ("missing DFA directory") when `dfa-dir` was not given.
pub fn dfa_writer(&self, name: &str) -> Result<Writer> {
    let mut builder = WriterBuilder::new(name);
    builder
        .columns(79)
        .char_literals(self.is_present("chars"))
        .trie_set(self.is_present("trie-set"));
    match self.value_of_os("dfa-dir") {
        Some(dir) => builder.from_dfa_dir(dir),
        None => err!("missing DFA directory"),
    }
}

/// Returns the value of the `name` argument.
///
/// # Panics
///
/// Panics with "the name of the table" if the argument has no value.
pub fn name(&self) -> &str {
    match self.value_of("name") {
        Some(table_name) => table_name,
        None => panic!("the name of the table"),
    }
}
Expand Down
6 changes: 0 additions & 6 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,3 @@ impl From<ucd_trie::Error> for Error {
Error::Other(err.to_string())
}
}

impl From<regex_automata::Error> for Error {
fn from(err: regex_automata::Error) -> Error {
Error::Other(err.to_string())
}
}
3 changes: 0 additions & 3 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ mod jamo_short_name;
mod joining_type;
mod names;
mod property_bool;
mod regex;
mod script;

fn main() {
Expand Down Expand Up @@ -85,8 +84,6 @@ fn run() -> Result<()> {
}
("word-break", Some(m)) => brk::word(ArgMatches::new(m)),
("sentence-break", Some(m)) => brk::sentence(ArgMatches::new(m)),
("dfa", Some(m)) => regex::command_dfa(ArgMatches::new(m)),
("regex", Some(m)) => regex::command_regex(ArgMatches::new(m)),
("test-unicode-data", Some(m)) => {
cmd_test_unicode_data(ArgMatches::new(m))
}
Expand Down
Loading

0 comments on commit cf7f4f0

Please sign in to comment.