diff --git a/Cargo.lock b/Cargo.lock index e99a420bf4c..a1e852d2407 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1721,7 +1721,7 @@ dependencies = [ [[package]] name = "icu_datagen" -version = "1.2.0" +version = "1.2.1" dependencies = [ "cached-path", "clap 4.0.32", diff --git a/provider/datagen/Cargo.toml b/provider/datagen/Cargo.toml index 816c1c43498..9ca02e3707b 100644 --- a/provider/datagen/Cargo.toml +++ b/provider/datagen/Cargo.toml @@ -5,7 +5,7 @@ [package] name = "icu_datagen" description = "Generate data for ICU4X DataProvider" -version = "1.2.0" +version = "1.2.1" authors = ["The ICU4X Project Developers"] edition = "2021" readme = "README.md" @@ -105,7 +105,7 @@ use_wasm = ["icu_codepointtrie_builder/wasm"] # If neither `use_wasm` nor `use_icu4c` are enabled, # rule based segmenter data will not be generated. use_icu4c = ["icu_codepointtrie_builder/icu4c"] -networking = ["cached-path"] +networking = ["dep:cached-path"] [[bin]] name = "icu4x-datagen" diff --git a/provider/datagen/src/source.rs b/provider/datagen/src/source.rs index 014d4d4ca80..e79644fe7a7 100644 --- a/provider/datagen/src/source.rs +++ b/provider/datagen/src/source.rs @@ -11,7 +11,6 @@ use std::collections::HashSet; use std::fmt::Debug; use std::io::Cursor; use std::io::Read; -use std::ops::Deref; use std::path::Path; use std::path::PathBuf; use std::sync::Arc; @@ -61,6 +60,9 @@ impl SourceData { /// The latest `SourceData` that has been verified to work with this version of `icu_datagen`. /// /// See [`SourceData::LATEST_TESTED_CLDR_TAG`] and [`SourceData::LATEST_TESTED_ICUEXPORT_TAG`]. + /// + /// Requires `networking` Cargo feature. + #[cfg(feature = "networking")] pub fn latest_tested() -> Self { Self::default() .with_cldr_for_tag(Self::LATEST_TESTED_CLDR_TAG, Default::default()) @@ -110,6 +112,9 @@ impl SourceData { /// using the given tag (see [GitHub releases](https://github.com/unicode-org/cldr-json/releases)). /// /// Also see: [`LATEST_TESTED_CLDR_TAG`](Self::LATEST_TESTED_CLDR_TAG) + /// + /// Requires `networking` Cargo feature. + #[cfg(feature = "networking")] pub fn with_cldr_for_tag( self, tag: &str, @@ -128,6 +133,9 @@ impl SourceData { /// using the given tag. (see [GitHub releases](https://github.com/unicode-org/icu/releases)). /// /// Also see: [`LATEST_TESTED_ICUEXPORT_TAG`](Self::LATEST_TESTED_ICUEXPORT_TAG) + /// + /// Requires `networking` Cargo feature. + #[cfg(feature = "networking")] pub fn with_icuexport_for_tag(self, mut tag: &str) -> Result { if tag == "release-71-1" { tag = "icu4x/2022-08-17/71.x"; @@ -147,6 +155,7 @@ impl SourceData { since = "1.1.0", note = "Use `with_cldr_for_tag(SourceData::LATEST_TESTED_CLDR_TAG)`" )] + #[cfg(feature = "networking")] /// Deprecated pub fn with_cldr_latest( self, @@ -159,6 +168,7 @@ impl SourceData { since = "1.1.0", note = "Use `with_icuexport_for_tag(SourceData::LATEST_TESTED_ICUEXPORT_TAG)`" )] + #[cfg(feature = "networking")] /// Deprecated pub fn with_icuexport_latest(self) -> Result { self.with_icuexport_for_tag(Self::LATEST_TESTED_ICUEXPORT_TAG) @@ -239,15 +249,6 @@ pub(crate) enum IcuTrieType { Small, } -impl IcuTrieType { - pub(crate) fn to_internal(self) -> icu_collections::codepointtrie::TrieType { - match self { - IcuTrieType::Fast => icu_collections::codepointtrie::TrieType::Fast, - IcuTrieType::Small => icu_collections::codepointtrie::TrieType::Small, - } - } -} - impl std::fmt::Display for IcuTrieType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { match self { @@ -378,48 +379,41 @@ impl AbstractFs { } } + #[cfg(feature = "networking")] fn new_from_url(path: String) -> Self { Self::Zip(RwLock::new(Err(path))) } fn init(&self) -> Result<(), DataError> { - match self { - Self::Zip(lock) => { - if lock.read().expect("poison").is_ok() { - return Ok(()); - } - let mut lock = lock.write().expect("poison"); - let resource = if let Err(resource) = lock.deref() { - resource - } else { - return Ok(()); - }; - - let root: PathBuf = { - #[cfg(not(feature = "networking"))] - unreachable!("AbstractFs URL mode only possible when using CLDR/ICU tags, which cannot be set without the `networking` feature"); - - #[cfg(feature = "networking")] - { - lazy_static::lazy_static! { - static ref CACHE: cached_path::Cache = cached_path::CacheBuilder::new() - .freshness_lifetime(u64::MAX) - .progress_bar(None) - .build() - .unwrap(); - } - - CACHE - .cached_path(resource) - .map_err(|e| DataError::custom("Download").with_display_context(&e))? - } - }; - *lock = Ok(ZipArchive::new(Cursor::new(std::fs::read(root)?)) - .map_err(|e| DataError::custom("Zip").with_display_context(&e))?); - Ok(()) + #[cfg(feature = "networking")] + if let Self::Zip(lock) = self { + if lock.read().expect("poison").is_ok() { + return Ok(()); } - _ => Ok(()), + let mut lock = lock.write().expect("poison"); + let resource = if let Err(resource) = &*lock { + resource + } else { + return Ok(()); + }; + + let root = { + lazy_static::lazy_static! { + static ref CACHE: cached_path::Cache = cached_path::CacheBuilder::new() + .freshness_lifetime(u64::MAX) + .progress_bar(None) + .build() + .unwrap(); + } + + CACHE + .cached_path(resource) + .map_err(|e| DataError::custom("Download").with_display_context(&e))? + }; + *lock = Ok(ZipArchive::new(Cursor::new(std::fs::read(root)?)) + .map_err(|e| DataError::custom("Zip").with_display_context(&e))?); } + Ok(()) } fn read_to_buf(&self, path: &str) -> Result, DataError> { diff --git a/provider/datagen/src/transform/segmenter/mod.rs b/provider/datagen/src/transform/segmenter/mod.rs index 4cf63628cc5..54523378834 100644 --- a/provider/datagen/src/transform/segmenter/mod.rs +++ b/provider/datagen/src/transform/segmenter/mod.rs @@ -4,6 +4,10 @@ //! This module contains provider implementations backed by built-in segmentation data. +// Some code gated on icu_codepointtrie_builder features +#![allow(dead_code)] +#![allow(unused_imports)] + use icu_codepointtrie_builder::{CodePointTrieBuilder, CodePointTrieBuilderData}; use icu_collections::codepointtrie::CodePointTrie; use icu_locid::{langid, locale}; @@ -222,6 +226,7 @@ fn is_cjk_fullwidth(eaw: maps::CodePointMapDataBorrowed, codepoi } impl crate::DatagenProvider { + #[cfg(any(feature = "use_wasm", feature = "use_icu4c"))] fn generate_rule_break_data( &self, key: DataKey, @@ -583,21 +588,18 @@ impl crate::DatagenProvider { let complex_property = get_index_from_name(&properties_names, "SA").unwrap_or(127); // Generate a CodePointTrie from properties_map - let property_trie: CodePointTrie = { - #[cfg(not(any(feature = "use_wasm", feature = "use_icu4c")))] - return Err(DataError::custom( - "icu_datagen must be built with use_icu4c or use_wasm to build segmenter data", - )); - - #[cfg(any(feature = "use_wasm", feature = "use_icu4c"))] - CodePointTrieBuilder { - data: CodePointTrieBuilderData::ValuesByCodePoint(&properties_map), - default_value: 0, - error_value: 0, - trie_type: self.source.trie_type().to_internal(), - } - .build() - }; + let property_trie: CodePointTrie = CodePointTrieBuilder { + data: CodePointTrieBuilderData::ValuesByCodePoint(&properties_map), + default_value: 0, + error_value: 0, + trie_type: match self.source.trie_type() { + crate::source::IcuTrieType::Fast => icu_collections::codepointtrie::TrieType::Fast, + crate::source::IcuTrieType::Small => { + icu_collections::codepointtrie::TrieType::Small + } + }, + } + .build(); if segmenter.segmenter_type == "line" { // Note: The following match statement had been used in line.rs: @@ -648,12 +650,18 @@ impl crate::DatagenProvider { impl DataProvider for crate::DatagenProvider { fn load(&self, _req: DataRequest) -> Result, DataError> { - let break_data = self.generate_rule_break_data(LineBreakDataV1Marker::KEY)?; - - Ok(DataResponse { + #[cfg(not(any(feature = "use_wasm", feature = "use_icu4c")))] + return Err(DataError::custom( + "icu_datagen must be built with use_icu4c or use_wasm to build segmenter/line@1", + ) + .with_req(LineBreakDataV1Marker::KEY, _req)); + #[cfg(any(feature = "use_wasm", feature = "use_icu4c"))] + return Ok(DataResponse { metadata: DataResponseMetadata::default(), - payload: Some(DataPayload::from_owned(break_data)), - }) + payload: Some(DataPayload::from_owned( + self.generate_rule_break_data(LineBreakDataV1Marker::KEY)?, + )), + }); } } @@ -662,23 +670,35 @@ impl DataProvider for crate::DatagenProvider { &self, _req: DataRequest, ) -> Result, DataError> { - let break_data = self.generate_rule_break_data(GraphemeClusterBreakDataV1Marker::KEY)?; - - Ok(DataResponse { + #[cfg(not(any(feature = "use_wasm", feature = "use_icu4c")))] + return Err(DataError::custom( + "icu_datagen must be built with use_icu4c or use_wasm to build segmenter/grapheme@1", + ) + .with_req(GraphemeClusterBreakDataV1Marker::KEY, _req)); + #[cfg(any(feature = "use_wasm", feature = "use_icu4c"))] + return Ok(DataResponse { metadata: DataResponseMetadata::default(), - payload: Some(DataPayload::from_owned(break_data)), - }) + payload: Some(DataPayload::from_owned( + self.generate_rule_break_data(GraphemeClusterBreakDataV1Marker::KEY)?, + )), + }); } } impl DataProvider for crate::DatagenProvider { fn load(&self, _req: DataRequest) -> Result, DataError> { - let break_data = self.generate_rule_break_data(WordBreakDataV1Marker::KEY)?; - - Ok(DataResponse { + #[cfg(not(any(feature = "use_wasm", feature = "use_icu4c")))] + return Err(DataError::custom( + "icu_datagen must be built with use_icu4c or use_wasm to build segmenter/word@1", + ) + .with_req(WordBreakDataV1Marker::KEY, _req)); + #[cfg(any(feature = "use_wasm", feature = "use_icu4c"))] + return Ok(DataResponse { metadata: DataResponseMetadata::default(), - payload: Some(DataPayload::from_owned(break_data)), - }) + payload: Some(DataPayload::from_owned( + self.generate_rule_break_data(WordBreakDataV1Marker::KEY)?, + )), + }); } } @@ -687,12 +707,18 @@ impl DataProvider for crate::DatagenProvider { &self, _req: DataRequest, ) -> Result, DataError> { - let break_data = self.generate_rule_break_data(SentenceBreakDataV1Marker::KEY)?; - - Ok(DataResponse { + #[cfg(not(any(feature = "use_wasm", feature = "use_icu4c")))] + return Err(DataError::custom( + "icu_datagen must be built with use_icu4c or use_wasm to build segmenter/sentence@1", + ) + .with_req(SentenceBreakDataV1Marker::KEY, _req)); + #[cfg(any(feature = "use_wasm", feature = "use_icu4c"))] + return Ok(DataResponse { metadata: DataResponseMetadata::default(), - payload: Some(DataPayload::from_owned(break_data)), - }) + payload: Some(DataPayload::from_owned( + self.generate_rule_break_data(SentenceBreakDataV1Marker::KEY)?, + )), + }); } }