Skip to content

Commit

Permalink
perf: Optimise string inference (#14)
Browse files Browse the repository at this point in the history
* perf: string inference runs fewer expensive tests

* make matching nicer to work with
  • Loading branch information
hgrsd authored Jul 4, 2024
1 parent 755d544 commit daefbb6
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 22 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name = "drivel"
description = "Infer a schema from JSON input, and generate synthetic data based on the inferred schema."
license = "MIT"
authors = ["Daniël Hogers <daniel@hgrsd.nl>"]
version = "0.2.1"
version = "0.2.2"
edition = "2021"
repository = "https://github.com/hgrsd/drivel"

Expand Down
75 changes: 55 additions & 20 deletions src/infer_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,62 @@ lazy_static! {
regex::Regex::new(r"[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]{2,}$").unwrap();
}

pub(crate) fn infer_string_type(s: &str) -> StringType {
if ISO_DATE_REGEX.is_match(s) {
StringType::IsoDate
} else if chrono::DateTime::parse_from_rfc2822(s).is_ok() {
StringType::DateTimeISO8601
} else if chrono::DateTime::parse_from_rfc3339(s).is_ok() {
StringType::DateTimeISO8601
} else if UUIDREGEX.is_match(s) {
StringType::UUID
} else if EMAIL_REGEX.is_match(s) {
StringType::Email
} else if url::Url::parse(s).is_ok() {
StringType::Url
} else if HOSTNAME_REGEX.is_match(s) {
StringType::Hostname
/// Classifies `s` as a UUID.
///
/// The cheap length comparison (36 bytes) runs first so that `UUIDREGEX` is
/// only evaluated for strings that have the right size.
///
/// Returns `Some(StringType::UUID)` on a match, `None` otherwise.
fn uuid(s: &str) -> Option<StringType> {
    let looks_like_uuid = s.len() == 36 && UUIDREGEX.is_match(s);
    looks_like_uuid.then_some(StringType::UUID)
}

/// Classifies `s` as an email address.
///
/// The substring test for `'@'` runs first so that `EMAIL_REGEX` is only
/// evaluated for strings that contain that character.
///
/// Returns `Some(StringType::Email)` on a match.
fn email(s: &str) -> Option<StringType> {
if s.contains('@') && EMAIL_REGEX.is_match(s) {
Some(StringType::Email)
} else {
// NOTE(review): the `StringType::Unknown { .. }` literal below looks like
// residue from the removed side of the diff rather than part of this
// function — confirm against the repository before relying on it.
StringType::Unknown {
strings_seen: vec![s.to_owned()],
chars_seen: s.chars().collect(),
min_length: Some(s.len()),
max_length: Some(s.len()),
None
}
}

/// Classifies `s` as a URL or a hostname.
///
/// Each expensive check is guarded by a cheap substring test:
///
/// * `url::Url::parse` is only attempted when `s` contains `':'`. Parsing
///   without a base URL only succeeds for absolute URLs, and those always
///   carry a `scheme:` prefix. Gating on `'.'` instead would wrongly reject
///   dot-less URLs such as `http://localhost:8080/` or `mailto:root`.
/// * `HOSTNAME_REGEX` is only evaluated when `s` contains `'.'`.
///
/// The URL check takes precedence over the hostname check.
///
/// Returns `Some(StringType::Url)`, `Some(StringType::Hostname)`, or `None`
/// when neither check matches.
fn url_host(s: &str) -> Option<StringType> {
    // An absolute URL needs a scheme delimiter; without one the parse is
    // guaranteed to fail, so skip it entirely.
    if s.contains(':') && url::Url::parse(s).is_ok() {
        return Some(StringType::Url);
    }

    if s.contains('.') && HOSTNAME_REGEX.is_match(s) {
        return Some(StringType::Hostname);
    }

    None
}

/// Classifies `s` as an ISO date or a date-time.
///
/// `ISO_DATE_REGEX` and the RFC 3339 parser are only tried when the first
/// character of `s` is numeric (an empty string passes this guard, as there
/// is no first character to fail it). The RFC 2822 parser is always tried
/// last and is not gated by that guard.
///
/// Returns `Some(StringType::IsoDate)` for a regex match,
/// `Some(StringType::DateTimeISO8601)` when either parser accepts the input,
/// and `None` otherwise.
fn dates(s: &str) -> Option<StringType> {
    // `map_or(true, ..)` keeps the "vacuously true for empty input" behaviour.
    let leading_numeric = s.chars().next().map_or(true, char::is_numeric);

    if leading_numeric && ISO_DATE_REGEX.is_match(s) {
        return Some(StringType::IsoDate);
    }

    let is_rfc3339 = leading_numeric && chrono::DateTime::parse_from_rfc3339(s).is_ok();
    if is_rfc3339 || chrono::DateTime::parse_from_rfc2822(s).is_ok() {
        return Some(StringType::DateTimeISO8601);
    }

    None
}

/// Infers the [`StringType`] of `s`.
///
/// The matchers are tried in a fixed order — `uuid`, `email`, `url_host`,
/// `dates` — and the first one that returns `Some` decides the result.
///
/// When no matcher recognises the input, a `StringType::Unknown` is returned
/// that records the string itself, its characters, and its length (in bytes,
/// as reported by `str::len`) as both the minimum and the maximum length.
pub(crate) fn infer_string_type(s: &str) -> StringType {
    let matchers: [fn(&str) -> Option<StringType>; 4] = [uuid, email, url_host, dates];

    matchers
        .into_iter()
        .find_map(|matcher| matcher(s))
        .unwrap_or_else(|| {
            let byte_len = s.len();
            StringType::Unknown {
                strings_seen: vec![s.to_owned()],
                chars_seen: s.chars().collect(),
                min_length: Some(byte_len),
                max_length: Some(byte_len),
            }
        })
}

0 comments on commit daefbb6

Please sign in to comment.