diff --git a/components/properties/src/lib.rs b/components/properties/src/lib.rs index 9b48d813bde..6c1f6aa485c 100644 --- a/components/properties/src/lib.rs +++ b/components/properties/src/lib.rs @@ -74,6 +74,7 @@ mod error; pub mod maps; mod props; pub mod provider; +pub mod script; pub mod sets; mod trievalue; mod ule; diff --git a/components/properties/src/maps.rs b/components/properties/src/maps.rs index 09a5deaea7a..fe991a26b4d 100644 --- a/components/properties/src/maps.rs +++ b/components/properties/src/maps.rs @@ -98,7 +98,7 @@ where } /// Return a [`CodePointTrie`] for the East_Asian_Width Unicode enumerated -/// property. See [`East_Asian_Width`]. +/// property. See [`EastAsianWidth`]. /// /// # Example /// diff --git a/components/properties/src/provider.rs b/components/properties/src/provider.rs index 514407afac1..14b504de8bc 100644 --- a/components/properties/src/provider.rs +++ b/components/properties/src/provider.rs @@ -6,6 +6,7 @@ //! //! Read more about data providers: [`icu_provider`] +use crate::script::ScriptExtensions; use icu_codepointtrie::{CodePointTrie, TrieValue}; use icu_provider::yoke::{self, *}; use icu_uniset::UnicodeSet; @@ -331,6 +332,16 @@ pub mod key { (SENTENCE_BREAK_V1, "SB"), ); + + define_resource_keys!(ALL_SCRIPT_EXTENSIONS_KEYS; 1; + // + // Script_Extensions + Script data + // + + // The ResourceKey subcategory string ("scx") is the short alias of Script_Extensions + + (SCRIPT_EXTENSIONS_V1, "scx"), + ); } // @@ -408,3 +419,20 @@ pub struct UnicodePropertyMapV1Marker { impl icu_provider::DataMarker for UnicodePropertyMapV1Marker { type Yokeable = UnicodePropertyMapV1<'static, T>; } + +// +// Script_Extensions +// + +/// A data structure efficiently storing `Script` and `Script_Extensions` property data (see [`ScriptExtensions`]). 
+#[icu_provider::data_struct] +#[derive(Debug, Eq, PartialEq)] +#[cfg_attr( + feature = "provider_serde", + derive(serde::Serialize, serde::Deserialize) +)] +pub struct ScriptExtensionsPropertyV1<'data> { + /// A special data structure for `Script` and `Script_Extensions`. + #[cfg_attr(feature = "provider_serde", serde(borrow))] + pub data: ScriptExtensions<'data>, +} diff --git a/components/properties/src/script.rs b/components/properties/src/script.rs new file mode 100644 index 00000000000..4412d2b9055 --- /dev/null +++ b/components/properties/src/script.rs @@ -0,0 +1,193 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Data and APIs for supporting both Script and Script_Extensions property +//! values in an efficient structure. + +use crate::error::PropertiesError; +use crate::props::Script; + +use icu_codepointtrie::{CodePointTrie, TrieValue}; +use icu_provider::yoke::{self, *}; +use zerovec::{VarZeroVec, ZeroSlice}; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +const SCRIPT_X_SCRIPT_VAL: u16 = 0x03FF; // bit mask with the low 10 bits set +const SCRIPT_VAL_LENGTH: u16 = 10; // width in bits of the low value/index field + +/// An internal-use-only pseudo-property that represents the values stored in +/// the trie of the special data structure [`ScriptExtensions`]. +/// +/// Note: The value will assume a 12-bit layout. The 2 higher order bits in positions +/// 11..10 will indicate how to deduce the Script value and Script_Extensions, +/// and the lower 10 bits 9..0 indicate either the Script value or the index +/// into the `extensions` structure. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[repr(transparent)] +pub struct ScriptWithExt(pub u16); + +#[allow(missing_docs)] // These constants don't need individual documentation. 
+#[allow(non_upper_case_globals)] +impl ScriptWithExt { + pub const Unknown: ScriptWithExt = ScriptWithExt(0); +} + +impl ScriptWithExt { + pub fn is_common(&self) -> bool { + self.0 >> SCRIPT_VAL_LENGTH == 1 + } + + pub fn is_inherited(&self) -> bool { + self.0 >> SCRIPT_VAL_LENGTH == 2 + } + + pub fn is_other(&self) -> bool { + self.0 >> SCRIPT_VAL_LENGTH == 3 + } +} + +/// A data structure that represents the data for both Script and +/// Script_Extensions properties in an efficient way. This structure matches +/// the data and data structures that are stored in the corresponding ICU data +/// file for these properties. +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug, Eq, PartialEq, Yokeable, ZeroCopyFrom)] +pub struct ScriptExtensions<'data> { + /// Note: The `ScriptWithExt` values in this trie will assume a 12-bit layout. The 2 + /// higher order bits 11..10 will indicate how to deduce the Script value and + /// Script_Extensions value, nearly matching the representation + /// [in ICU](https://github.com/unicode-org/icu/blob/main/icu4c/source/common/uprops.h): + /// + /// | High order 2 bits value | Script | Script_Extensions | + /// |-------------------------|--------------------------------------------------------|----------------------------------------------------------------| + /// | 3 | First value in sub-array, index given by lower 10 bits | Sub-array excluding first value, index given by lower 10 bits | + /// | 2 | Script=Inherited | Entire sub-array, index given by lower 10 bits | + /// | 1 | Script=Common | Entire sub-array, index given by lower 10 bits | + /// | 0 | Value in lower 10 bits | `[ Script value ]` single-element array | + /// + /// When the lower 10 bits of the value are used as an index, that index is + /// used for the outer-level vector of the nested `extensions` structure. 
+ #[cfg_attr(feature = "serde", serde(borrow))] + trie: CodePointTrie<'data, ScriptWithExt>, + + /// This companion structure stores Script_Extensions values, which are + /// themselves arrays / vectors. This structure only stores the values for + /// cases in which `scx(cp) != [ sc(cp) ]`. Each sub-vector is distinct. The + /// sub-vector represents the Script_Extensions array value for a code point, + /// and may also indicate Script value, as described for the `trie` field. + #[cfg_attr(feature = "serde", serde(borrow))] + extensions: VarZeroVec<'data, ZeroSlice