Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backport Joining_Type and UTS 46 data to the 1.4 branch #4926

Merged
merged 5 commits into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
# Changelog

## icu4x 1.4.x
- [Expose UTS 46 data via the normalizer](https://github.com/unicode-org/icu4x/pull/4712)
- `icu_normalizer@1.4.2`

- [Implement Joining_Type property](https://github.com/unicode-org/icu4x/pull/4599)
- `icu_properties@1.4.1`

- [Remove icu_datagen's dep on `fractional`](https://github.com/unicode-org/icu4x/pull/4472)
- `icu_datagen@1.4.1`

Expand Down
6 changes: 3 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion components/normalizer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ name = "icu_normalizer"
description = "API for normalizing text into Unicode Normalization Forms"
license-file = "LICENSE"

version = "1.4.1"
version = "1.4.2"
rust-version.workspace = true
authors.workspace = true
edition.workspace = true
Expand Down
148 changes: 91 additions & 57 deletions components/normalizer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ extern crate alloc;
mod error;
pub mod properties;
pub mod provider;
pub mod uts46;

pub use crate::error::NormalizerError;

Expand All @@ -79,7 +80,6 @@ pub use NormalizerError as Error;
use crate::provider::CanonicalDecompositionDataV1Marker;
use crate::provider::CompatibilityDecompositionSupplementV1Marker;
use crate::provider::DecompositionDataV1;
#[cfg(feature = "experimental")]
use crate::provider::Uts46DecompositionSupplementV1Marker;
use alloc::string::String;
use alloc::vec::Vec;
Expand All @@ -106,20 +106,30 @@ use zerovec::{zeroslice, ZeroSlice};
#[derive(Debug)]
enum SupplementPayloadHolder {
Compatibility(DataPayload<CompatibilityDecompositionSupplementV1Marker>),
#[cfg(feature = "experimental")]
Uts46(DataPayload<Uts46DecompositionSupplementV1Marker>),
}

impl SupplementPayloadHolder {
/// Borrows the `DecompositionSupplementV1` carried by this holder,
/// regardless of which variant (and thus which data marker) holds it.
fn get(&self) -> &DecompositionSupplementV1 {
match self {
SupplementPayloadHolder::Compatibility(d) => d.get(),
#[cfg(feature = "experimental")]
SupplementPayloadHolder::Uts46(d) => d.get(),
}
}
}

/// Treatment of the ignorable marker (0xFFFFFFFF) in data.
///
/// Consulted by `delegate_next_no_pending` when the supplementary trie
/// yields `IGNORABLE_MARKER` for a character.
#[derive(Debug, PartialEq, Eq)]
enum IgnorableBehavior {
/// 0xFFFFFFFF in data is not supported; encountering it trips a
/// `debug_assert!`.
Unsupported,
/// Ignorables are ignored: the character is skipped and the next
/// character is read from the delegate instead.
Ignored,
/// Ignorables are treated as singleton decompositions
/// to the REPLACEMENT CHARACTER.
ReplacementCharacter,
}

/// Number of iterations allowed on the fast path before flushing.
/// Since a typical UTF-16 iteration advances over a 2-byte BMP
/// character, this means two memory pages.
Expand All @@ -132,6 +142,9 @@ impl SupplementPayloadHolder {
/// passes an error through from `Write`.
const UTF16_FAST_PATH_FLUSH_THRESHOLD: usize = 4096;

/// Trie value that marks a character as a UTS 46 ignorable.
/// This is the all-ones `u32` bit pattern (0xFFFFFFFF).
const IGNORABLE_MARKER: u32 = u32::MAX;

/// Marker for starters that decompose to themselves but may
/// combine backwards under canonical composition.
/// (Main trie only; not used in the supplementary trie.)
Expand Down Expand Up @@ -528,6 +541,7 @@ where
/// 1. Decomposes to self.
/// 2. Decomposition starts with a non-starter
decomposition_passthrough_bound: u32, // never above 0xC0
ignorable_behavior: IgnorableBehavior, // Arguably should be a type parameter
}

impl<'data, I> Decomposition<'data, I>
Expand All @@ -549,7 +563,15 @@ where
decompositions: &'data DecompositionDataV1,
tables: &'data DecompositionTablesV1,
) -> Self {
Self::new_with_supplements(delegate, decompositions, None, tables, None, 0xC0)
Self::new_with_supplements(
delegate,
decompositions,
None,
tables,
None,
0xC0,
IgnorableBehavior::Unsupported,
)
}

/// Constructs a decomposing iterator adapter from a delegate
Expand All @@ -565,6 +587,7 @@ where
tables: &'data DecompositionTablesV1,
supplementary_tables: Option<&'data DecompositionTablesV1>,
decomposition_passthrough_bound: u8,
ignorable_behavior: IgnorableBehavior,
) -> Self {
let half_width_voicing_marks_become_non_starters =
if let Some(supplementary) = supplementary_decompositions {
Expand Down Expand Up @@ -595,6 +618,7 @@ where
},
half_width_voicing_marks_become_non_starters,
decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound),
ignorable_behavior,
};
let _ = ret.next(); // Remove the U+FFFF placeholder
ret
Expand Down Expand Up @@ -721,16 +745,42 @@ where

fn delegate_next_no_pending(&mut self) -> Option<CharacterAndTrieValue> {
debug_assert!(self.pending.is_none());
let c = self.delegate.next()?;
loop {
let c = self.delegate.next()?;

// TODO(#2384): Measure if this check is actually an optimization even in the
// non-supplementary case or if this should go inside the supplementary
// `if` below.
if u32::from(c) < self.decomposition_passthrough_bound {
return Some(CharacterAndTrieValue::new(c, 0));
}
// TODO(#2384): Measure if this check is actually an optimization even in the
// non-supplementary case or if this should go inside the supplementary
// `if` below.
if u32::from(c) < self.decomposition_passthrough_bound {
return Some(CharacterAndTrieValue::new(c, 0));
}

Some(self.attach_trie_value(c))
if let Some(supplementary) = self.supplementary_trie {
if let Some(value) = self.attach_supplementary_trie_value(c, supplementary) {
if value.trie_val == IGNORABLE_MARKER {
match self.ignorable_behavior {
IgnorableBehavior::Unsupported => {
debug_assert!(false);
}
IgnorableBehavior::ReplacementCharacter => {
return Some(CharacterAndTrieValue::new(
c,
u32::from(REPLACEMENT_CHARACTER),
));
}
IgnorableBehavior::Ignored => {
// Else ignore this character by reading the next one from the delegate.
continue;
}
}
}
return Some(value);
}
}
let trie_val = self.trie.get(c);
debug_assert_ne!(trie_val, IGNORABLE_MARKER);
return Some(CharacterAndTrieValue::new(c, trie_val));
}
}

fn delegate_next(&mut self) -> Option<CharacterAndTrieValue> {
Expand Down Expand Up @@ -1229,6 +1279,7 @@ macro_rules! composing_normalize_to {
) -> core::fmt::Result {
$prolog
let mut $composition = self.normalize_iter($text.chars());
debug_assert_eq!($composition.decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
for cc in $composition.decomposition.buffer.drain(..) {
$sink.write_char(cc.character())?;
}
Expand Down Expand Up @@ -1416,6 +1467,7 @@ macro_rules! decomposing_normalize_to {
$prolog

let mut $decomposition = self.normalize_iter($text.chars());
debug_assert_eq!($decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);

// Try to get the compiler to hoist the bound to a register.
let $decomposition_passthrough_bound = $decomposition.decomposition_passthrough_bound;
Expand Down Expand Up @@ -1730,8 +1782,8 @@ impl DecomposingNormalizer {
}

#[doc(hidden)]
#[cfg(all(feature = "experimental", feature = "compiled_data"))]
pub const fn new_uts46_decomposed_without_ignored_and_disallowed() -> Self {
#[cfg(feature = "compiled_data")]
pub(crate) const fn new_uts46_decomposed() -> Self {
const _: () = assert!(
crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
.scalars16
Expand Down Expand Up @@ -1807,8 +1859,7 @@ impl DecomposingNormalizer {
///
/// Public for testing only.
#[doc(hidden)]
#[cfg(feature = "experimental")]
pub fn try_new_uts46_decomposed_without_ignored_and_disallowed_unstable<D>(
pub(crate) fn try_new_uts46_decomposed_unstable<D>(
provider: &D,
) -> Result<Self, NormalizerError>
where
Expand Down Expand Up @@ -1872,6 +1923,7 @@ impl DecomposingNormalizer {
self.tables.get(),
self.supplementary_tables.as_ref().map(|s| s.get()),
self.decomposition_passthrough_bound,
IgnorableBehavior::Unsupported,
)
}

Expand Down Expand Up @@ -2241,52 +2293,27 @@ impl ComposingNormalizer {
})
}

/// See [`Self::try_new_uts46_without_ignored_and_disallowed_unstable`].
#[cfg(all(feature = "experimental", feature = "compiled_data"))]
pub const fn new_uts46_without_ignored_and_disallowed() -> Self {
ComposingNormalizer {
decomposing_normalizer:
DecomposingNormalizer::new_uts46_decomposed_without_ignored_and_disallowed(),
canonical_compositions: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
),
}
}

/// 🚧 \[Experimental\] UTS 46 constructor
///
/// This is a special building block normalization for IDNA that implements parts of the Map
/// step and the following Normalize step. The caller is responsible for performing the
/// "disallowed", "ignored", and "deviation" parts of the Map step before passing data to
/// this normalizer such that disallowed and ignored characters aren't passed to this
/// normalizer.
///
/// This is ICU4C's UTS 46 normalization with two exceptions: characters that UTS 46 disallows
/// and ICU4C maps to U+FFFD and characters that UTS 46 maps to the empty string normalize as
/// in NFC in this normalization. Making the disallowed characters behave like this is beneficial
/// to data size, and this normalizer implementation cannot deal with a character normalizing
/// to the empty string, which doesn't happen in NFC or NFKC as of Unicode 14.
/// step and the following Normalize step.
///
/// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
/// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
/// U+0345 from a reordered character into a non-reordered character before reordering happens.
/// Therefore, the output of this normalization may differ for different inputs that are
/// canonically equivalent to each other if they differ by how U+0345 is ordered relative
/// to other reorderable characters.
///
/// NOTE: This method remains experimental until suitability of this feature as part of
/// IDNA processing has been demonstrated.
///
/// <div class="stab unstable">
/// 🚧 This code is experimental; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. It can be enabled with the "experimental" Cargo feature
/// of the icu meta-crate. Use with caution.
/// <a href="https://github.com/unicode-org/icu4x/issues/2614">#2614</a>
/// </div>
#[cfg(feature = "experimental")]
pub fn try_new_uts46_without_ignored_and_disallowed_unstable<D>(
provider: &D,
) -> Result<Self, NormalizerError>
#[cfg(feature = "compiled_data")]
pub(crate) const fn new_uts46() -> Self {
ComposingNormalizer {
decomposing_normalizer: DecomposingNormalizer::new_uts46_decomposed(),
canonical_compositions: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
),
}
}

#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_uts46)]
pub(crate) fn try_new_uts46_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
where
D: DataProvider<CanonicalDecompositionDataV1Marker>
+ DataProvider<Uts46DecompositionSupplementV1Marker>
Expand All @@ -2297,9 +2324,7 @@ impl ComposingNormalizer {
+ ?Sized,
{
let decomposing_normalizer =
DecomposingNormalizer::try_new_uts46_decomposed_without_ignored_and_disallowed_unstable(
provider,
)?;
DecomposingNormalizer::try_new_uts46_decomposed_unstable(provider)?;

let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> =
provider.load(Default::default())?.take_payload()?;
Expand All @@ -2313,6 +2338,14 @@ impl ComposingNormalizer {
/// Wraps a delegate iterator into a composing iterator
/// adapter by using the data already held by this normalizer.
pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Composition<I> {
// The public entry point always selects `IgnorableBehavior::Unsupported`;
// other ignorable behaviors are only reachable via `normalize_iter_private`.
self.normalize_iter_private(iter, IgnorableBehavior::Unsupported)
}

fn normalize_iter_private<I: Iterator<Item = char>>(
&self,
iter: I,
ignorable_behavior: IgnorableBehavior,
) -> Composition<I> {
Composition::new(
Decomposition::new_with_supplements(
iter,
Expand All @@ -2327,6 +2360,7 @@ impl ComposingNormalizer {
.as_ref()
.map(|s| s.get()),
self.decomposing_normalizer.decomposition_passthrough_bound,
ignorable_behavior,
),
ZeroFrom::zero_from(&self.canonical_compositions.get().canonical_compositions),
self.decomposing_normalizer.composition_passthrough_bound,
Expand Down
Loading
Loading