Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make UTS 46 normalization non-experimental #4712

Merged
merged 18 commits into from
May 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 91 additions & 57 deletions components/normalizer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ extern crate alloc;
mod error;
pub mod properties;
pub mod provider;
pub mod uts46;

pub use crate::error::NormalizerError;

Expand All @@ -79,7 +80,6 @@ pub use NormalizerError as Error;
use crate::provider::CanonicalDecompositionDataV1Marker;
use crate::provider::CompatibilityDecompositionSupplementV1Marker;
use crate::provider::DecompositionDataV1;
#[cfg(feature = "experimental")]
use crate::provider::Uts46DecompositionSupplementV1Marker;
use alloc::string::String;
use alloc::vec::Vec;
Expand All @@ -106,20 +106,30 @@ use zerovec::{zeroslice, ZeroSlice};
/// Owns the data payload for one kind of supplementary decomposition data.
/// Each variant wraps the `DataPayload` for a different supplement marker.
#[derive(Debug)]
enum SupplementPayloadHolder {
/// Payload loaded for `CompatibilityDecompositionSupplementV1Marker`.
Compatibility(DataPayload<CompatibilityDecompositionSupplementV1Marker>),
/// Payload loaded for `Uts46DecompositionSupplementV1Marker`.
#[cfg(feature = "experimental")]
Uts46(DataPayload<Uts46DecompositionSupplementV1Marker>),
}

impl SupplementPayloadHolder {
/// Borrows the `DecompositionSupplementV1` data from whichever payload
/// variant this holder currently wraps.
fn get(&self) -> &DecompositionSupplementV1 {
// Every variant wraps a payload; forward to that payload's own `get()`.
match self {
SupplementPayloadHolder::Compatibility(d) => d.get(),
#[cfg(feature = "experimental")]
SupplementPayloadHolder::Uts46(d) => d.get(),
}
}
}

/// Treatment of the ignorable marker (0xFFFFFFFF) in data.
///
/// This is a field-less selector, so it derives `Clone` and `Copy`: the
/// value can be stored in an adapter and still be handed on or compared
/// by value without being moved out of its owner.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum IgnorableBehavior {
    /// 0xFFFFFFFF in data is not supported.
    Unsupported,
    /// Ignorables are ignored.
    Ignored,
    /// Ignorables are treated as singleton decompositions
    /// to the REPLACEMENT CHARACTER.
    ReplacementCharacter,
}

/// Number of iterations allowed on the fast path before flushing.
/// Since a typical UTF-16 iteration advances over a 2-byte BMP
/// character, this means two memory pages.
Expand All @@ -132,6 +142,9 @@ impl SupplementPayloadHolder {
/// passes an error through from `Write`.
const UTF16_FAST_PATH_FLUSH_THRESHOLD: usize = 4096;

/// Trie value that marks a character as a UTS 46 ignorable
/// (all bits set, i.e. `0xFFFFFFFF`).
const IGNORABLE_MARKER: u32 = u32::MAX;

/// Marker for starters that decompose to themselves but may
/// combine backwards under canonical composition.
/// (Main trie only; not used in the supplementary trie.)
Expand Down Expand Up @@ -528,6 +541,7 @@ where
/// 1. Decomposes to self.
/// 2. Decomposition starts with a non-starter
decomposition_passthrough_bound: u32, // never above 0xC0
ignorable_behavior: IgnorableBehavior, // Arguably should be a type parameter
}

impl<'data, I> Decomposition<'data, I>
Expand All @@ -549,7 +563,15 @@ where
decompositions: &'data DecompositionDataV1,
tables: &'data DecompositionTablesV1,
) -> Self {
Self::new_with_supplements(delegate, decompositions, None, tables, None, 0xC0)
Self::new_with_supplements(
delegate,
decompositions,
None,
tables,
None,
0xC0,
IgnorableBehavior::Unsupported,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: why is the default value for behavior of handling an ignored character not to ignore it (IgnorableBehavior::Ignored)?

IgnorableBehavior::Ignored is what UTS 46 would do by default, right?

Or do you default to Unsupported because this is in the underlying Decomposition struct, which can be used independently of UTS 46, and thus would want to be kept separate by default unless otherwise specified?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The underlying struct can be used for non-UTS46 purposes: the usual normalizations. For those, we expose contiguous-buffer APIs, and the contiguous-buffer versions currently can't deal with ignorables, and since there is currently no use case for them to deal with ignorables, I think it's better to have a state that they can debug_assert! against than to add support for ignorables in the contiguous-buffer entry points.

)
}

/// Constructs a decomposing iterator adapter from a delegate
Expand All @@ -565,6 +587,7 @@ where
tables: &'data DecompositionTablesV1,
supplementary_tables: Option<&'data DecompositionTablesV1>,
decomposition_passthrough_bound: u8,
ignorable_behavior: IgnorableBehavior,
) -> Self {
let half_width_voicing_marks_become_non_starters =
if let Some(supplementary) = supplementary_decompositions {
Expand Down Expand Up @@ -595,6 +618,7 @@ where
},
half_width_voicing_marks_become_non_starters,
decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound),
ignorable_behavior,
};
let _ = ret.next(); // Remove the U+FFFF placeholder
ret
Expand Down Expand Up @@ -721,16 +745,42 @@ where

fn delegate_next_no_pending(&mut self) -> Option<CharacterAndTrieValue> {
debug_assert!(self.pending.is_none());
let c = self.delegate.next()?;
loop {
let c = self.delegate.next()?;

// TODO(#2384): Measure if this check is actually an optimization even in the
// non-supplementary case or if this should go inside the supplementary
// `if` below.
if u32::from(c) < self.decomposition_passthrough_bound {
return Some(CharacterAndTrieValue::new(c, 0));
}
// TODO(#2384): Measure if this check is actually an optimization even in the
// non-supplementary case or if this should go inside the supplementary
// `if` below.
if u32::from(c) < self.decomposition_passthrough_bound {
return Some(CharacterAndTrieValue::new(c, 0));
}

Some(self.attach_trie_value(c))
if let Some(supplementary) = self.supplementary_trie {
if let Some(value) = self.attach_supplementary_trie_value(c, supplementary) {
if value.trie_val == IGNORABLE_MARKER {
match self.ignorable_behavior {
IgnorableBehavior::Unsupported => {
debug_assert!(false);
}
IgnorableBehavior::ReplacementCharacter => {
return Some(CharacterAndTrieValue::new(
c,
u32::from(REPLACEMENT_CHARACTER),
));
}
IgnorableBehavior::Ignored => {
// Else ignore this character by reading the next one from the delegate.
continue;
}
}
}
return Some(value);
}
}
let trie_val = self.trie.get(c);
debug_assert_ne!(trie_val, IGNORABLE_MARKER);
return Some(CharacterAndTrieValue::new(c, trie_val));
}
}

fn delegate_next(&mut self) -> Option<CharacterAndTrieValue> {
Expand Down Expand Up @@ -1229,6 +1279,7 @@ macro_rules! composing_normalize_to {
) -> core::fmt::Result {
$prolog
let mut $composition = self.normalize_iter($text.chars());
debug_assert_eq!($composition.decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
for cc in $composition.decomposition.buffer.drain(..) {
$sink.write_char(cc.character())?;
}
Expand Down Expand Up @@ -1416,6 +1467,7 @@ macro_rules! decomposing_normalize_to {
$prolog

let mut $decomposition = self.normalize_iter($text.chars());
debug_assert_eq!($decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);

// Try to get the compiler to hoist the bound to a register.
let $decomposition_passthrough_bound = $decomposition.decomposition_passthrough_bound;
Expand Down Expand Up @@ -1730,8 +1782,8 @@ impl DecomposingNormalizer {
}

#[doc(hidden)]
#[cfg(all(feature = "experimental", feature = "compiled_data"))]
pub const fn new_uts46_decomposed_without_ignored_and_disallowed() -> Self {
#[cfg(feature = "compiled_data")]
pub(crate) const fn new_uts46_decomposed() -> Self {
const _: () = assert!(
crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
.scalars16
Expand Down Expand Up @@ -1807,8 +1859,7 @@ impl DecomposingNormalizer {
///
/// Public for testing only.
#[doc(hidden)]
#[cfg(feature = "experimental")]
pub fn try_new_uts46_decomposed_without_ignored_and_disallowed_unstable<D>(
pub(crate) fn try_new_uts46_decomposed_unstable<D>(
provider: &D,
) -> Result<Self, NormalizerError>
where
Expand Down Expand Up @@ -1872,6 +1923,7 @@ impl DecomposingNormalizer {
self.tables.get(),
self.supplementary_tables.as_ref().map(|s| s.get()),
self.decomposition_passthrough_bound,
IgnorableBehavior::Unsupported,
)
}

Expand Down Expand Up @@ -2241,52 +2293,27 @@ impl ComposingNormalizer {
})
}

/// See [`Self::try_new_uts46_without_ignored_and_disallowed_unstable`].
#[cfg(all(feature = "experimental", feature = "compiled_data"))]
pub const fn new_uts46_without_ignored_and_disallowed() -> Self {
ComposingNormalizer {
decomposing_normalizer:
DecomposingNormalizer::new_uts46_decomposed_without_ignored_and_disallowed(),
canonical_compositions: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
),
}
}

/// 🚧 \[Experimental\] UTS 46 constructor
///
/// This is a special building block normalization for IDNA that implements parts of the Map
/// step and the following Normalize step. The caller is responsible for performing the
/// "disallowed", "ignored", and "deviation" parts of the Map step before passing data to
/// this normalizer such that disallowed and ignored characters aren't passed to this
/// normalizer.
///
/// This is ICU4C's UTS 46 normalization with two exceptions: characters that UTS 46 disallows
/// and ICU4C maps to U+FFFD and characters that UTS 46 maps to the empty string normalize as
/// in NFC in this normalization. Making the disallowed characters behave like this is beneficial
/// to data size, and this normalizer implementation cannot deal with a character normalizing
/// to the empty string, which doesn't happen in NFC or NFKC as of Unicode 14.
/// step and the following Normalize step.
///
/// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
/// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
/// U+0345 from a reordered character into a non-reordered character before reordering happens.
/// Therefore, the output of this normalization may differ for different inputs that are
/// canonically equivalents with each other if they differ by how U+0345 is ordered relative
/// to other reorderable characters.
///
/// NOTE: This method remains experimental until suitability of this feature as part of
/// IDNA processing has been demonstrated.
///
/// <div class="stab unstable">
/// 🚧 This code is experimental; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. It can be enabled with the "experimental" Cargo feature
/// of the icu meta-crate. Use with caution.
/// <a href="https://github.com/unicode-org/icu4x/issues/2614">#2614</a>
/// </div>
#[cfg(feature = "experimental")]
pub fn try_new_uts46_without_ignored_and_disallowed_unstable<D>(
provider: &D,
) -> Result<Self, NormalizerError>
#[cfg(feature = "compiled_data")]
pub(crate) const fn new_uts46() -> Self {
    // Pair the UTS 46 decomposing normalizer with the baked canonical
    // composition data; both are built in place so this stays `const`.
    Self {
        decomposing_normalizer: DecomposingNormalizer::new_uts46_decomposed(),
        canonical_compositions: DataPayload::from_static_ref(
            crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
        ),
    }
}

#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_uts46)]
pub(crate) fn try_new_uts46_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
where
D: DataProvider<CanonicalDecompositionDataV1Marker>
+ DataProvider<Uts46DecompositionSupplementV1Marker>
Expand All @@ -2297,9 +2324,7 @@ impl ComposingNormalizer {
+ ?Sized,
{
let decomposing_normalizer =
DecomposingNormalizer::try_new_uts46_decomposed_without_ignored_and_disallowed_unstable(
provider,
)?;
DecomposingNormalizer::try_new_uts46_decomposed_unstable(provider)?;

let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> =
provider.load(Default::default())?.take_payload()?;
Expand All @@ -2313,6 +2338,14 @@ impl ComposingNormalizer {
/// Wraps a delegate iterator into a composing iterator
/// adapter by using the data already held by this normalizer.
pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Composition<I> {
// The public entry point always requests `IgnorableBehavior::Unsupported`;
// any other ignorable handling has to go through the private variant.
self.normalize_iter_private(iter, IgnorableBehavior::Unsupported)
}

fn normalize_iter_private<I: Iterator<Item = char>>(
&self,
iter: I,
ignorable_behavior: IgnorableBehavior,
) -> Composition<I> {
Composition::new(
Decomposition::new_with_supplements(
iter,
Expand All @@ -2327,6 +2360,7 @@ impl ComposingNormalizer {
.as_ref()
.map(|s| s.get()),
self.decomposing_normalizer.decomposition_passthrough_bound,
ignorable_behavior,
),
ZeroFrom::zero_from(&self.canonical_compositions.get().canonical_compositions),
self.decomposing_normalizer.composition_passthrough_bound,
Expand Down
Loading
Loading