Skip to content

Commit

Permalink
Split numbering systems out of decimal data (#5822)
Browse files Browse the repository at this point in the history
Fixes #5818


Before:

```
 decimal/symbols@2, <lookup>, 1316B, 252 identifiers
 decimal/symbols@2, <total>, 4308B, 2436B, 49 unique payloads
```

After:


```
 decimal/digits@1, <lookup>, 207B, 27 identifiers
 decimal/digits@1, <total>, 1080B, 1060B, 27 unique payloads
 decimal/symbols@2, <lookup>, 804B, 184 identifiers
 decimal/symbols@2, <total>, 1749B, 881B, 31 unique payloads

```

Saving ~1.5kB, a good third of the data size. A lot of the wins are just
in deduplication.

I'm also going to try moving the tinystr into the VarZeroVec and seeing
what happens.

I may also try and store the digits more compactly as an `enum {
Sequential(char), Many(ZeroVec<char>) }`. A downside of this is that the
Sequential case would need UTF8 validation every time, though we could
make it so that that's just the wire format and we expand to a digit
array on data load.


Todo: add configurability for this.

@sffc In the long run CompactDecimal / etc should also be using this
data. In that case, should we just always generate all known decimal
systems? How would the unification work across keys?
  • Loading branch information
Manishearth authored Nov 15, 2024
1 parent 36d97cd commit 53eac4d
Show file tree
Hide file tree
Showing 62 changed files with 541 additions and 416 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion components/datetime/src/external_loaders.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,9 @@ pub(crate) struct ExternalLoaderUnstable<'a, P: ?Sized>(pub &'a P);

impl<P> FixedDecimalFormatterLoader for ExternalLoaderUnstable<'_, P>
where
P: ?Sized + DataProvider<icu_decimal::provider::DecimalSymbolsV2Marker>,
P: ?Sized
+ DataProvider<icu_decimal::provider::DecimalSymbolsV2Marker>
+ DataProvider<icu_decimal::provider::DecimalDigitsV1Marker>,
{
#[inline]
fn load(
Expand Down
9 changes: 5 additions & 4 deletions components/datetime/src/format/neo.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ use icu_calendar::types::FormattingEra;
use icu_calendar::types::MonthCode;
use icu_decimal::options::FixedDecimalFormatterOptions;
use icu_decimal::options::GroupingStrategy;
use icu_decimal::provider::DecimalSymbolsV2Marker;
use icu_decimal::provider::{DecimalDigitsV1Marker, DecimalSymbolsV2Marker};
use icu_decimal::FixedDecimalFormatter;
use icu_provider::marker::NeverMarker;
use icu_provider::prelude::*;
Expand Down Expand Up @@ -262,7 +262,7 @@ where
size_test!(
TypedDateTimeNames<icu_calendar::Gregorian, DateTimeMarker>,
typed_date_time_names_size,
336
352
);

/// A low-level type that formats datetime patterns with localized names.
Expand Down Expand Up @@ -625,7 +625,7 @@ impl<C: CldrCalendar, R: DateTimeNamesMarker> TypedDateTimeNames<C, R> {
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::try_new)]
pub fn try_new_unstable<P>(provider: &P, locale: &DataLocale) -> Result<Self, DataError>
where
P: DataProvider<DecimalSymbolsV2Marker> + ?Sized,
P: DataProvider<DecimalSymbolsV2Marker> + DataProvider<DecimalDigitsV1Marker> + ?Sized,
{
let mut names = Self {
locale: locale.clone(),
Expand Down Expand Up @@ -1418,7 +1418,7 @@ impl<C: CldrCalendar, R: DateTimeNamesMarker> TypedDateTimeNames<C, R> {
#[inline]
pub fn load_fixed_decimal_formatter<P>(&mut self, provider: &P) -> Result<&mut Self, DataError>
where
P: DataProvider<DecimalSymbolsV2Marker> + ?Sized,
P: DataProvider<DecimalSymbolsV2Marker> + DataProvider<DecimalDigitsV1Marker> + ?Sized,
{
self.inner
.load_fixed_decimal_formatter(&ExternalLoaderUnstable(provider), &self.locale)?;
Expand Down Expand Up @@ -1498,6 +1498,7 @@ impl<C: CldrCalendar, R: DateTimeNamesMarker> TypedDateTimeNames<C, R> {
+ DataProvider<tz::MzSpecificShortV1Marker>
+ DataProvider<tz::MzPeriodV1Marker>
+ DataProvider<DecimalSymbolsV2Marker>
+ DataProvider<DecimalDigitsV1Marker>
+ ?Sized,
{
let locale = &self.locale;
Expand Down
4 changes: 2 additions & 2 deletions components/datetime/src/neo.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ macro_rules! gen_any_buffer_constructors_with_external_loader {
// }
// }

size_test!(FixedCalendarDateTimeFormatter<icu_calendar::Gregorian, crate::fieldset::YMD>, typed_neo_year_month_day_formatter_size, 336);
size_test!(FixedCalendarDateTimeFormatter<icu_calendar::Gregorian, crate::fieldset::YMD>, typed_neo_year_month_day_formatter_size, 352);

/// [`FixedCalendarDateTimeFormatter`] is a formatter capable of formatting dates and/or times from
/// a calendar selected at compile time.
Expand Down Expand Up @@ -353,7 +353,7 @@ where
size_test!(
DateTimeFormatter<crate::fieldset::YMD>,
neo_year_month_day_formatter_size,
392
408
);

/// [`DateTimeFormatter`] is a formatter capable of formatting dates and/or times from
Expand Down
11 changes: 8 additions & 3 deletions components/datetime/src/scaffold/fieldset_traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ use icu_calendar::{
},
Date, Iso, Time,
};
use icu_decimal::provider::DecimalSymbolsV2Marker;
use icu_decimal::provider::{DecimalDigitsV1Marker, DecimalSymbolsV2Marker};
use icu_provider::{marker::NeverMarker, prelude::*};
use icu_timezone::scaffold::IntoOption;
use icu_timezone::{TimeZoneBcp47Id, UtcOffset, ZoneVariant};
Expand Down Expand Up @@ -368,10 +368,13 @@ where

/// Trait to consolidate data provider markers external to this crate
/// for datetime formatting with a fixed calendar.
pub trait AllFixedCalendarExternalDataMarkers: DataProvider<DecimalSymbolsV2Marker> {}
pub trait AllFixedCalendarExternalDataMarkers:
DataProvider<DecimalSymbolsV2Marker> + DataProvider<DecimalDigitsV1Marker>
{
}

impl<T> AllFixedCalendarExternalDataMarkers for T where
T: ?Sized + DataProvider<DecimalSymbolsV2Marker>
T: ?Sized + DataProvider<DecimalSymbolsV2Marker> + DataProvider<DecimalDigitsV1Marker>
{
}

Expand All @@ -385,6 +388,7 @@ pub trait AllAnyCalendarExternalDataMarkers:
+ DataProvider<JapaneseErasV1Marker>
+ DataProvider<JapaneseExtendedErasV1Marker>
+ DataProvider<DecimalSymbolsV2Marker>
+ DataProvider<DecimalDigitsV1Marker>
{
}

Expand All @@ -397,6 +401,7 @@ impl<T> AllAnyCalendarExternalDataMarkers for T where
+ DataProvider<JapaneseErasV1Marker>
+ DataProvider<JapaneseExtendedErasV1Marker>
+ DataProvider<DecimalSymbolsV2Marker>
+ DataProvider<DecimalDigitsV1Marker>
{
}

Expand Down
5 changes: 3 additions & 2 deletions components/decimal/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ databake = { workspace = true, features = ["derive"], optional = true}
serde = { workspace = true, features = ["derive", "alloc"], optional = true }

icu_decimal_data = { workspace = true, optional = true }
tinystr = { workspace = true }

[dev-dependencies]
icu = { path = "../../components/icu", default-features = false }
Expand All @@ -46,8 +47,8 @@ criterion = { workspace = true }
[features]
default = ["compiled_data"]
std = ["fixed_decimal/std", "icu_locale_core/std", "icu_provider/std"]
serde = ["dep:serde", "icu_provider/serde", "zerovec/serde"]
datagen = ["serde", "dep:databake", "zerovec/databake"]
serde = ["dep:serde", "icu_provider/serde", "zerovec/serde", "tinystr/serde"]
datagen = ["serde", "dep:databake", "zerovec/databake", "tinystr/databake"]
bench = ["serde"]
compiled_data = ["dep:icu_decimal_data"]

Expand Down
23 changes: 5 additions & 18 deletions components/decimal/benches/fixed_decimal_format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,8 @@ use rand_pcg::Lcg64Xsh32;
use criterion::{black_box, criterion_group, criterion_main, Criterion};

use fixed_decimal::FixedDecimal;
use icu_decimal::provider::{Baked, DecimalSymbolsV2Marker};
use icu_decimal::FixedDecimalFormatter;
use icu_decimal::{FixedDecimalFormatter, FixedDecimalFormatterPreferences};
use icu_locale_core::locale;
use icu_provider::prelude::*;
use icu_provider_adapters::fixed::FixedProvider;

fn triangular_nums(range: f64) -> Vec<isize> {
// Use Lcg64Xsh32, a small, fast PRNG.
Expand All @@ -28,24 +25,14 @@ fn triangular_nums(range: f64) -> Vec<isize> {

fn overview_bench(c: &mut Criterion) {
let nums = triangular_nums(1e9);
let data = Baked
.load(DataRequest {
id: DataIdentifierBorrowed::for_locale(&locale!("en-US").into()),
..Default::default()
})
.unwrap()
.payload;
let provider = FixedProvider::<DecimalSymbolsV2Marker>::from_payload(data);
let locale = locale!("en-US");
let prefs = FixedDecimalFormatterPreferences::from(&locale);
let options = Default::default();
c.bench_function("icu_decimal/overview", |b| {
b.iter(|| {
// This benchmark demonstrates the performance of the format function on 1000 numbers
// ranging from -1e9 to 1e9.
let fdf = FixedDecimalFormatter::try_new_unstable(
&provider,
Default::default(),
Default::default(),
)
.unwrap();
let fdf = FixedDecimalFormatter::try_new(prefs.clone(), options).unwrap();
for &num in &nums {
let fd = FixedDecimal::from(black_box(num));
fdf.format_to_string(&fd);
Expand Down
3 changes: 2 additions & 1 deletion components/decimal/src/format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ pub struct FormattedFixedDecimal<'l> {
pub(crate) value: &'l FixedDecimal,
pub(crate) options: &'l FixedDecimalFormatterOptions,
pub(crate) symbols: &'l DecimalSymbolsV2<'l>,
pub(crate) digits: &'l DecimalDigitsV1,
}

impl FormattedFixedDecimal<'_> {
Expand Down Expand Up @@ -47,7 +48,7 @@ impl Writeable for FormattedFixedDecimal<'_> {
sink.write_str(self.symbols.decimal_separator())?;
}
#[allow(clippy::indexing_slicing)] // digit_at in 0..=9
sink.write_char(self.symbols.digits[self.value.digit_at(m) as usize])?;
sink.write_char(self.digits.digits[self.value.digit_at(m) as usize])?;
if grouper::check(
upper_magnitude,
m,
Expand Down
9 changes: 8 additions & 1 deletion components/decimal/src/grouper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ fn test_grouper() {
use fixed_decimal::FixedDecimal;
use icu_provider::prelude::*;
use icu_provider_adapters::fixed::FixedProvider;
use icu_provider_adapters::fork::ForkByMarkerProvider;
use writeable::assert_writeable_eq;

let western_sizes = GroupingSizesV1 {
Expand Down Expand Up @@ -154,12 +155,18 @@ fn test_grouper() {
for cas in &cases {
for i in 0..4 {
let dec = FixedDecimal::from(1).multiplied_pow10((i as i16) + 3);
let provider = FixedProvider::<DecimalSymbolsV2Marker>::from_owned(
let provider_symbols = FixedProvider::<DecimalSymbolsV2Marker>::from_owned(
crate::provider::DecimalSymbolsV2 {
grouping_sizes: cas.sizes,
..DecimalSymbolsV2::new_en_for_testing()
},
);
let provider_digits = FixedProvider::<DecimalDigitsV1Marker>::from_owned(
crate::provider::DecimalDigitsV1 {
digits: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
},
);
let provider = ForkByMarkerProvider::new(provider_symbols, provider_digits);
let options = options::FixedDecimalFormatterOptions {
grouping_strategy: cas.strategy,
..Default::default()
Expand Down
30 changes: 26 additions & 4 deletions components/decimal/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,14 +100,15 @@ pub use format::FormattedFixedDecimal;

use alloc::string::String;
use fixed_decimal::FixedDecimal;
use icu_locale_core::locale;
use icu_locale_core::preferences::{
define_preferences, extensions::unicode::keywords::NumberingSystem,
};
use icu_provider::prelude::*;
use size_test_macro::size_test;
use writeable::Writeable;

size_test!(FixedDecimalFormatter, fixed_decimal_formatter_size, 88);
size_test!(FixedDecimalFormatter, fixed_decimal_formatter_size, 104);

define_preferences!(
/// The preferences for fixed decimal formatting.
Expand Down Expand Up @@ -135,6 +136,7 @@ define_preferences!(
pub struct FixedDecimalFormatter {
options: options::FixedDecimalFormatterOptions,
symbols: DataPayload<provider::DecimalSymbolsV2Marker>,
digits: DataPayload<provider::DecimalDigitsV1Marker>,
}

impl AsRef<FixedDecimalFormatter> for FixedDecimalFormatter {
Expand All @@ -150,7 +152,11 @@ impl FixedDecimalFormatter {
);

#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::try_new)]
pub fn try_new_unstable<D: DataProvider<provider::DecimalSymbolsV2Marker> + ?Sized>(
pub fn try_new_unstable<
D: DataProvider<provider::DecimalSymbolsV2Marker>
+ DataProvider<provider::DecimalDigitsV1Marker>
+ ?Sized,
>(
provider: &D,
prefs: FixedDecimalFormatterPreferences,
options: options::FixedDecimalFormatterOptions,
Expand All @@ -163,7 +169,7 @@ impl FixedDecimalFormatter {
.as_ref()
.map(|s| s.as_str())
.unwrap_or("");
let symbols = provider
let symbols: DataPayload<provider::DecimalSymbolsV2Marker> = provider
.load(DataRequest {
id: DataIdentifierBorrowed::for_marker_attributes_and_locale(
DataMarkerAttributes::from_str_or_panic(nu),
Expand All @@ -172,7 +178,22 @@ impl FixedDecimalFormatter {
..Default::default()
})?
.payload;
Ok(Self { options, symbols })

let digits = provider
.load(DataRequest {
id: DataIdentifierBorrowed::for_marker_attributes_and_locale(
DataMarkerAttributes::from_str_or_panic(&symbols.get().numsys),
&locale!("und").into(),
),
..Default::default()
})?
.payload;

Ok(Self {
options,
symbols,
digits,
})
}

/// Formats a [`FixedDecimal`], returning a [`FormattedFixedDecimal`].
Expand All @@ -181,6 +202,7 @@ impl FixedDecimalFormatter {
value,
options: &self.options,
symbols: self.symbols.get(),
digits: self.digits.get(),
}
}

Expand Down
35 changes: 32 additions & 3 deletions components/decimal/src/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

use alloc::borrow::Cow;
use icu_provider::prelude::*;
use tinystr::TinyStr8;
use zerovec::VarZeroCow;

#[cfg(feature = "compiled_data")]
Expand All @@ -41,11 +42,12 @@ const _: () = {
}
make_provider!(Baked);
impl_decimal_symbols_v2_marker!(Baked);
impl_decimal_digits_v1_marker!(Baked);
};

#[cfg(feature = "datagen")]
/// The latest minimum set of markers required by this component.
pub const MARKERS: &[DataMarkerInfo] = &[DecimalSymbolsV2Marker::INFO];
pub const MARKERS: &[DataMarkerInfo] = &[DecimalSymbolsV2Marker::INFO, DecimalDigitsV1Marker::INFO];

/// A collection of settings expressing where to put grouping separators in a decimal number.
/// For example, `1,000,000` has two grouping separators, positioned along every 3 digits.
Expand Down Expand Up @@ -75,7 +77,9 @@ pub struct GroupingSizesV1 {
pub min_grouping: u8,
}

/// The strings used in DecimalSymbolsV2
/// A stack representation of the strings used in [`DecimalSymbolsV2`], i.e. a builder type
/// for [`DecimalSymbolsStrs`]. This type can be obtained from a [`DecimalSymbolsStrs`]
/// the `From`/`Into` traits.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
Expand Down Expand Up @@ -114,6 +118,13 @@ pub struct DecimalSymbolStrsBuilder<'data> {
pub grouping_separator: Cow<'data, str>,
}

impl<'data> DecimalSymbolStrsBuilder<'data> {
/// Build a [`DecimalSymbolsStrs`]
pub fn build(&self) -> VarZeroCow<'static, DecimalSymbolsStrs> {
VarZeroCow::from_encodeable(self)
}
}

/// Symbols and metadata required for formatting a [`FixedDecimal`](crate::FixedDecimal).
///
/// <div class="stab unstable">
Expand All @@ -136,6 +147,24 @@ pub struct DecimalSymbolsV2<'data> {
/// Settings used to determine where to place groups in the integer part of the number.
pub grouping_sizes: GroupingSizesV1,

/// The numbering system to use.
pub numsys: TinyStr8,
}

/// The digits for a given numbering system. This data ought to be stored in the `und` locale with an auxiliary key
/// set to the numbering system code.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[icu_provider::data_struct(DecimalDigitsV1Marker = "decimal/digits@1")]
#[derive(Debug, PartialEq, Clone, Copy)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_decimal::provider))]
pub struct DecimalDigitsV1 {
/// Digit characters for the current numbering system. In most systems, these digits are
/// contiguous, but in some systems, such as *hanidec*, they are not contiguous.
pub digits: [char; 10],
Expand Down Expand Up @@ -185,7 +214,7 @@ impl DecimalSymbolsV2<'static> {
secondary: 3,
min_grouping: 1,
},
digits: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
numsys: tinystr::tinystr!(8, "latn"),
}
}
}
Loading

0 comments on commit 53eac4d

Please sign in to comment.