Skip to content

Commit

Permalink
Change data model for IANA-to-BCP47 to use ZeroTrie
Browse files Browse the repository at this point in the history
  • Loading branch information
sffc committed Sep 12, 2023
1 parent 3627f8b commit 7d2d80e
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 23 deletions.
3 changes: 2 additions & 1 deletion components/timezone/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ icu_calendar = { workspace = true }
icu_locid = { workspace = true }
icu_provider = { workspace = true, features = ["macros"] }
tinystr = { workspace = true, features = ["alloc", "zerovec"] }
zerotrie = { workspace = true, features = ["databake", "yoke", "zerofrom"] }
zerovec = { workspace = true, features = ["derive", "yoke"] }

databake = { workspace = true, optional = true, features = ["derive"] }
Expand All @@ -38,6 +39,6 @@ icu = { workspace = true }
[features]
default = ["compiled_data"]
std = ["icu_calendar/std", "icu_locid/std", "icu_provider/std"]
serde = ["dep:serde", "zerovec/serde", "tinystr/serde", "icu_provider/serde"]
serde = ["dep:serde", "zerovec/serde", "zerotrie/serde", "tinystr/serde", "icu_provider/serde"]
datagen = ["serde", "dep:databake", "zerovec/databake", "tinystr/databake"]
compiled_data = ["dep:icu_timezone_data"]
11 changes: 4 additions & 7 deletions components/timezone/src/iana_ids.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,8 @@ impl<'a> IanaToBcp47MapperBorrowed<'a> {
///
/// See examples in [`IanaToBcp47Mapper`].
pub fn get_strict(&self, iana_id: &str) -> Option<TimeZoneBcp47Id> {
self.data
.map
.get_copied(NormalizedTimeZoneIdStr::from_str(iana_id))
// Two-step lookup: the trie maps the IANA string to an index; `?` returns
// `None` early when the trie has no entry for `iana_id`.
let idx = self.data.map.get(iana_id)?;
// The index is then resolved against the `bcp47_ids` list; the result of that
// `get` call is returned as-is.
self.data.bcp47_ids.get(idx)
}

/// Looks up a BCP-47 time zone identifier based on an ASCII-case-insensitive match for
Expand All @@ -105,10 +104,8 @@ impl<'a> IanaToBcp47MapperBorrowed<'a> {
/// See examples in [`IanaToBcp47Mapper`].
///
/// [ECMAScript Temporal]: https://tc39.es/proposal-temporal/#sec-isavailabletimezonename
pub fn get_loose(&self, iana_id: &str) -> Option<TimeZoneBcp47Id> {
self.data
.map
.get_copied_by(|probe| probe.cmp_loose(NormalizedTimeZoneIdStr::from_str(iana_id)))
pub fn get_loose(&self, _iana_id: &str) -> Option<TimeZoneBcp47Id> {
// NOTE(review): this body is a stub — every call panics via `unimplemented!()`
// instead of returning `None` or a match. The parameter is named `_iana_id`
// because it is currently unused.
unimplemented!()
}
}

Expand Down
11 changes: 8 additions & 3 deletions components/timezone/src/provider/names.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ use icu_provider::prelude::*;

use crate::TimeZoneBcp47Id;
use tinystr::UnvalidatedTinyAsciiStr;
use zerotrie::ZeroTrie;
use zerovec::ule::{UnvalidatedStr, VarULE};
use zerovec::{maps::ZeroMapKV, VarZeroSlice, VarZeroVec, ZeroMap};
use zerovec::{maps::ZeroMapKV, VarZeroSlice, VarZeroVec, ZeroMap, ZeroVec};

/// This is a time zone identifier that can be "loose matched" as according to
/// [ECMAScript Temporal](https://tc39.es/proposal-temporal/#sec-isavailabletimezonename)
Expand Down Expand Up @@ -181,9 +182,13 @@ impl NormalizedTimeZoneIdStr {
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[yoke(prove_covariance_manually)]
pub struct IanaToBcp47MapV1<'data> {
/// A map from IANA time zone identifiers to BCP-47 time zone identifiers
/// A map from IANA time zone identifiers to indexes of BCP-47 time zone identifiers.
///
/// The indexes refer to positions in [`Self::bcp47_ids`].
#[cfg_attr(feature = "serde", serde(borrow))]
pub map: ZeroMap<'data, NormalizedTimeZoneIdStr, TimeZoneBcp47Id>,
pub map: ZeroTrie<ZeroVec<'data, u8>>,
/// A sorted list of BCP-47 time zone identifiers.
#[cfg_attr(feature = "serde", serde(borrow))]
// Note: this is 9739B as ZeroVec<TinyStr8> and 9335B as VarZeroVec<str>
// (presumably the serialized size of this field under each representation — TODO confirm).
pub bcp47_ids: ZeroVec<'data, TimeZoneBcp47Id>,
}

/// A mapping from IANA time zone identifiers to BCP-47 time zone identifiers.
Expand Down
1 change: 1 addition & 0 deletions provider/datagen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ icu_provider = { workspace = true, features = ["std", "logging", "datagen"]}
icu_provider_adapters = { workspace = true }
tinystr = { workspace = true, features = ["alloc", "serde", "zerovec"] }
writeable = { workspace = true }
zerotrie = { workspace = true, features = ["alloc"] }
zerovec = { workspace = true, features = ["serde", "yoke"] }

# Exporters
Expand Down
47 changes: 35 additions & 12 deletions provider/datagen/src/transform/cldr/time_zones/names.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,47 @@

use super::convert::compute_bcp47_tzids_btreemap;
use crate::transform::cldr::cldr_serde;
use icu_provider::prelude::*;
use icu_provider::datagen::IterableDataProvider;
use icu_provider::prelude::*;
use icu_timezone::provider::names::*;
use icu_timezone::TimeZoneBcp47Id;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use zerotrie::ZeroTriePerfectHash;
use zerovec::ZeroVec;

impl DataProvider<IanaToBcp47MapV1Marker> for crate::DatagenProvider {
fn load(&self, _: DataRequest) -> Result<DataResponse<IanaToBcp47MapV1Marker>, DataError> {
let resource: &cldr_serde::time_zones::bcp47_tzid::Resource =
self.cldr()?
.bcp47()
.read_and_parse("timezone.json")?;
let bcp47_tzid_data = &compute_bcp47_tzids_btreemap(&resource.keyword.u.time_zones.values);
self.cldr()?.bcp47().read_and_parse("timezone.json")?;

let iana2bcp = &compute_bcp47_tzids_btreemap(&resource.keyword.u.time_zones.values);

// Sort and deduplicate the BCP-47 IDs:
let bcp_set: BTreeSet<TimeZoneBcp47Id> = iana2bcp.values().copied().collect();
let bcp_zerovec: ZeroVec<TimeZoneBcp47Id> = bcp_set.iter().copied().collect();

// Transform the map to use BCP indices:
#[allow(clippy::unwrap_used)] // structures are derived from each other
let map: BTreeMap<Vec<u8>, usize> = iana2bcp
.iter()
.map(|(iana, bcp)| {
(
iana.as_bytes().to_vec(),
bcp_zerovec.binary_search(bcp).unwrap(),
)
})
.collect();

let data_struct = IanaToBcp47MapV1 {
map: bcp47_tzid_data
.iter()
.map(|(k, v)| (NormalizedTimeZoneIdStr::boxed_from_bytes(k.as_bytes()), v))
.collect(),
map: ZeroTriePerfectHash::try_from(&map)
.map_err(|e| {
DataError::custom("Could not create ZeroTrie from timezone.json data")
.with_display_context(&e)
})?
.cast_store()
.into_zerotrie(),
bcp47_ids: bcp_zerovec,
};
Ok(DataResponse {
metadata: Default::default(),
Expand All @@ -37,9 +62,7 @@ impl IterableDataProvider<IanaToBcp47MapV1Marker> for crate::DatagenProvider {
impl DataProvider<Bcp47ToIanaMapV1Marker> for crate::DatagenProvider {
fn load(&self, _: DataRequest) -> Result<DataResponse<Bcp47ToIanaMapV1Marker>, DataError> {
let resource: &cldr_serde::time_zones::bcp47_tzid::Resource =
self.cldr()?
.bcp47()
.read_and_parse("timezone.json")?;
self.cldr()?.bcp47().read_and_parse("timezone.json")?;
// Note: The BTreeMap retains the order of the aliases, which is important for establishing
// the canonical order of the IANA names.
let bcp47_tzid_data = &compute_bcp47_tzids_btreemap(&resource.keyword.u.time_zones.values);
Expand Down

0 comments on commit 7d2d80e

Please sign in to comment.