Skip to content

Commit

Permalink
Move baked data lookup size calculations to its exporter (#5161)
Browse files Browse the repository at this point in the history
This is more accurate, as the previous code needed to make some educated
guesses. Needed for zerotrie where we can't guess.

Based on #5169
  • Loading branch information
robertbastian authored Jul 3, 2024
1 parent 3d0bd17 commit fbff129
Show file tree
Hide file tree
Showing 462 changed files with 5,244 additions and 2,784 deletions.
80 changes: 51 additions & 29 deletions provider/baked/src/binary_search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,64 +9,86 @@ use databake::*;
use icu_provider::prelude::*;

#[cfg(feature = "export")]
pub fn bake(
pub(crate) fn bake(
marker_bake: &TokenStream,
reqs_to_idents: Vec<(DataIdentifierCow, proc_macro2::Ident)>,
mut ids_to_idents: Vec<(DataIdentifierCow, proc_macro2::Ident)>,
idents_to_bakes: Vec<(proc_macro2::Ident, TokenStream)>,
) -> TokenStream {
let mut ids_to_idents = reqs_to_idents
.into_iter()
.map(|(id, ident)| {
(
(id.marker_attributes.to_string(), id.locale.to_string()),
quote!(#ident),
)
})
.collect::<Vec<_>>();

ids_to_idents.sort_by(|(a, _), (b, _)| a.cmp(b));
) -> (TokenStream, usize) {
let mut size = 0;

let idents_to_bakes = idents_to_bakes.into_iter().map(|(ident, bake)| {
quote! {
const #ident: &S = &#bake;
}
// Data.0 is a fat pointer
size += core::mem::size_of::<&[()]>();

// The idents are references
size += ids_to_idents.len() * core::mem::size_of::<&()>();

ids_to_idents.sort_by_cached_key(|(id, _)| {
(
id.marker_attributes.as_str().to_string(),
id.locale.to_string(),
)
});

let (ty, reqs_to_idents) = if ids_to_idents.iter().all(|((a, _), _)| a.is_empty()) {
let (ty, id_bakes_to_idents) = if ids_to_idents
.iter()
.all(|(id, _)| id.marker_attributes.is_empty())
{
// Only DataLocales
size += ids_to_idents.len() * core::mem::size_of::<&str>();
(
quote! { icu_provider_baked::binary_search::Locale },
ids_to_idents
.iter()
.map(|((_, l), i)| quote!((#l, #i)))
.map(|(id, ident)| {
let k = id.locale.to_string();
quote!((#k, #ident))
})
.collect::<Vec<_>>(),
)
} else if ids_to_idents.iter().all(|((_, l), _)| *l == "und") {
} else if ids_to_idents.iter().all(|(id, _)| id.locale.is_und()) {
// Only marker attributes
size += ids_to_idents.len() * core::mem::size_of::<&str>();
(
quote! { icu_provider_baked::binary_search::Attributes },
ids_to_idents
.iter()
.map(|((a, _), i)| quote!((#a, #i)))
.map(|(id, ident)| {
let k = id.marker_attributes.as_str();
quote!((#k, #ident))
})
.collect(),
)
} else {
size += ids_to_idents.len() * 2 * core::mem::size_of::<&str>();
(
quote! { icu_provider_baked::binary_search::AttributesAndLocale },
ids_to_idents
.iter()
.map(|((a, l), i)| quote!(((#a, #l), #i)))
.map(|(id, ident)| {
let k0 = id.marker_attributes.as_str();
let k1 = id.locale.to_string();
quote!(((#k0, #k1), #ident))
})
.collect(),
)
};

quote! {
icu_provider_baked::binary_search::Data<#ty, #marker_bake> = {
type S = <#marker_bake as icu_provider::DynamicDataMarker>::Yokeable;
#(#idents_to_bakes)*
icu_provider_baked::binary_search::Data(&[#(#reqs_to_idents,)*])
let idents_to_bakes = idents_to_bakes.into_iter().map(|(ident, bake)| {
quote! {
const #ident: &S = &#bake;
}
}
});

(
quote! {
icu_provider_baked::binary_search::Data<#ty, #marker_bake> = {
type S = <#marker_bake as icu_provider::DynamicDataMarker>::Yokeable;
#(#idents_to_bakes)*
icu_provider_baked::binary_search::Data(&[#(#id_bakes_to_idents,)*])
}
},
size,
)
}

/// Baked lookup table for one data marker.
///
/// Wraps a `'static` slice of `(key, data)` pairs: the key has type `K::Type` and the
/// data is a reference to the marker's `Yokeable` struct. The exporter emits the entries
/// ordered by their (marker attributes, locale) strings.
// NOTE(review): presumably lookups binary-search this slice by key; the lookup code is
// not visible in this view — confirm.
pub struct Data<K: BinarySearchKey, M: DataMarker>(pub &'static [(K::Type, &'static M::Yokeable)]);
Expand Down
128 changes: 99 additions & 29 deletions provider/baked/src/export.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ use icu_provider::export::*;
use icu_provider::prelude::*;
use std::collections::HashSet;
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::fmt::Write as _;
use std::fs::File;
use std::io::Write;
use std::path::Path;
Expand Down Expand Up @@ -162,12 +163,20 @@ pub struct BakedExporter {
HashMap<DataPayload<ExportMarker>, HashSet<DataIdentifierCow<'static>>>,
>,
>,
/// (marker, file name) pairs to wire up in mod.rs. This is populated by `flush` and consumed by `close`.
impl_data: Mutex<BTreeMap<DataMarkerInfo, SyncTokenStream>>,
/// file names and statistics to be consumed by `close`.
impl_data: Mutex<BTreeMap<DataMarkerInfo, (SyncTokenStream, Statistics)>>,
// List of dependencies used by baking.
dependencies: CrateEnv,
}

/// Size statistics collected for a single data marker while baking.
///
/// The exporter stores one value per marker and uses it to render the size notes in the
/// generated macro documentation.
#[derive(Debug, Clone, Default)]
pub struct Statistics {
    /// Sum, in bytes, of the `baked_size()` of every unique data struct.
    pub structs_total_size: usize,
    /// Number of unique (deduplicated) data structs.
    pub structs_count: usize,
    /// Size, in bytes, of the lookup data structure as reported by the lookup exporter.
    /// Set to `0` for singleton markers.
    pub lookup_struct_size: usize,
    /// Number of data identifiers that resolve to one of the structs.
    pub identifiers_count: usize,
}

impl std::fmt::Debug for BakedExporter {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("BakedExporter")
Expand Down Expand Up @@ -248,8 +257,9 @@ impl BakedExporter {
};

if !self.use_separate_crates {
// Don't search the whole file, there should be a macro in the first 300 bytes
if formatted[..300].contains("macro_rules!") || formatted[..100].contains("include!") {
// Don't search the whole file, there should be a macro in the first 1000 bytes
if formatted[..1000].contains("macro_rules!") || formatted[..1000].contains("include!")
{
// Formatted, otherwise it'd be `macro_rules !`
formatted = formatted
.replace("icu_", "icu::")
Expand Down Expand Up @@ -298,16 +308,43 @@ impl BakedExporter {
fn write_impl_macros(
&self,
marker: DataMarkerInfo,
stats: Statistics,
body: TokenStream,
iterable_body: TokenStream,
) -> Result<(), DataError> {
let marker_unqualified = bake_marker(marker).into_iter().last().unwrap().to_string();

let doc = format!(
" Implement `DataProvider<{}>` on the given struct using the data",
marker_unqualified
let &Statistics {
structs_total_size,
structs_count,
lookup_struct_size,
identifiers_count,
} = &stats;

let mut doc = format!(
" Implement `DataProvider<{marker_unqualified}>` on the given struct using the data\n \
hardcoded in this file. This allows the struct to be used with\n \
`icu`'s `_unstable` constructors."
);

if structs_count > 0 {
let _infallible = write!(&mut doc, "\n\n Using this implementation will embed the following data in the binary's data segment:\n ");

if marker.is_singleton {
let _infallible = write!(
&mut doc,
"* {structs_total_size}B[^1] for the singleton data struct\n "
);
} else {
let _infallible = write!(&mut doc, "* {lookup_struct_size}B[^1] for the lookup data structure ({identifiers_count} data identifiers)\n ");
let _infallible = write!(&mut doc, "* {structs_total_size}B[^1] for the actual data ({structs_count} unique structs)\n ");
};
let _infallible = write!(
&mut doc,
"\n [^1]: these numbers can be smaller in practice due to linker deduplication"
);
}

let ident = marker_unqualified.to_snake_case();

let macro_ident = format!("impl_{ident}",).parse::<TokenStream>().unwrap();
Expand All @@ -323,8 +360,6 @@ impl BakedExporter {
Path::new(&format!("{ident}.rs.data")),
quote! {
#[doc = #doc]
/// hardcoded in this file. This allows the struct to be used with
/// `icu`'s `_unstable` constructors.
#[doc(hidden)] // macro
#[macro_export]
macro_rules! #prefixed_macro_ident {
Expand All @@ -343,7 +378,10 @@ impl BakedExporter {
},
)?;

self.impl_data.lock().expect("poison").insert(marker, ident);
self.impl_data
.lock()
.expect("poison")
.insert(marker, (ident, stats));
Ok(())
}
}
Expand Down Expand Up @@ -390,7 +428,14 @@ impl DataExporter for BakedExporter {

let bake = payload.tokenize(&self.dependencies);

self.write_impl_macros(marker, quote! {
let stats = Statistics {
structs_total_size: payload.baked_size(),
structs_count: 1,
identifiers_count: 1,
lookup_struct_size: 0,
};

self.write_impl_macros(marker, stats, quote! {
#maybe_msrv
impl $provider {
// Exposing singleton structs as consts allows us to get rid of fallibility
Expand Down Expand Up @@ -439,6 +484,7 @@ impl DataExporter for BakedExporter {
if deduplicated_values.is_empty() {
self.write_impl_macros(
marker,
Default::default(),
quote! {
#maybe_msrv
impl icu_provider::DataProvider<#marker_bake> for $provider {
Expand All @@ -461,27 +507,33 @@ impl DataExporter for BakedExporter {
)
} else {
let mut idents_to_bakes = Vec::new();
let mut stats = Statistics::default();

let ids_to_idents = deduplicated_values
.iter()
.flat_map(|(payload, ids)| {
let ident = ids
let min_id = ids
.iter()
.map(|id| {
format!("_{}_{}", id.marker_attributes.as_str(), id.locale)
.chars()
.map(|ch| {
if ch == '-' {
'_'
} else {
ch.to_ascii_uppercase()
}
})
.collect::<String>()
})
.min()
.min_by_key(|id| (id.marker_attributes.as_str(), id.locale.to_string()))
.unwrap();
let ident = proc_macro2::Ident::new(&ident, proc_macro2::Span::call_site());

let ident = proc_macro2::Ident::new(
&format!("_{}_{}", min_id.marker_attributes.as_str(), min_id.locale)
.chars()
.map(|ch| {
if ch == '-' {
'_'
} else {
ch.to_ascii_uppercase()
}
})
.collect::<String>(),
proc_macro2::Span::call_site(),
);

stats.structs_count += 1;
stats.identifiers_count += ids.len();
stats.structs_total_size += payload.baked_size();

idents_to_bakes.push((ident.clone(), payload.tokenize(&self.dependencies)));
ids.iter().map(move |id| (id.clone(), ident.clone()))
Expand All @@ -503,7 +555,10 @@ impl DataExporter for BakedExporter {
.parse::<TokenStream>()
.unwrap();

let data = crate::binary_search::bake(&marker_bake, ids_to_idents, idents_to_bakes);
let (data, lookup_struct_size) =
crate::binary_search::bake(&marker_bake, ids_to_idents, idents_to_bakes);

stats.lookup_struct_size = lookup_struct_size;

let search = if !self.use_internal_fallback
|| deduplicated_values
Expand Down Expand Up @@ -544,6 +599,7 @@ impl DataExporter for BakedExporter {

self.write_impl_macros(
marker,
stats,
quote! {
#maybe_msrv
impl $provider {
Expand Down Expand Up @@ -588,11 +644,11 @@ impl DataExporter for BakedExporter {

let marker_bakes = data.keys().copied().map(bake_marker);

let file_paths = data.values().map(|i| format!("{i}.rs.data"));
let file_paths = data.values().map(|(i, _)| format!("{i}.rs.data"));

let macro_idents = data
.values()
.map(|i| format!("impl_{i}").parse::<TokenStream>().unwrap());
.map(|(i, _)| format!("impl_{i}").parse::<TokenStream>().unwrap());

// mod.rs is the interface for built-in data. It exposes one macro per marker.
self.write_to_file(
Expand Down Expand Up @@ -660,6 +716,20 @@ impl DataExporter for BakedExporter {
},
)?;

// TODO: Return the statistics instead of writing them out.
let mut file = crlify::BufWriterWithLineEndingFix::new(std::fs::File::create(
self.mod_directory.join("fingerprints.csv"),
)?);
for (marker, (_, stats)) in data {
if !marker.is_singleton {
writeln!(
&mut file,
"{marker:?}, <lookup>, {}B, {} identifiers",
stats.lookup_struct_size, stats.identifiers_count
)?;
}
}

self.print_deps();

Ok(())
Expand Down
1 change: 1 addition & 0 deletions provider/baked/tests/data/fingerprints.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
core/helloworld@1, <lookup>, 1096B, 27 identifiers
6 changes: 6 additions & 0 deletions provider/baked/tests/data/hello_world_v1_marker.rs.data
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@
/// Implement `DataProvider<HelloWorldV1Marker>` on the given struct using the data
/// hardcoded in this file. This allows the struct to be used with
/// `icu`'s `_unstable` constructors.
///
/// Using this implementation will embed the following data in the binary's data segment:
/// * 1096B[^1] for the lookup data structure (27 data identifiers)
/// * 1100B[^1] for the actual data (27 unique structs)
///
/// [^1]: these numbers can be smaller in practice due to linker deduplication
#[doc(hidden)]
#[macro_export]
macro_rules! __impl_hello_world_v1_marker {
Expand Down
5 changes: 5 additions & 0 deletions provider/data/calendar/data/chinese_cache_v1_marker.rs.data

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions provider/data/calendar/data/dangi_cache_v1_marker.rs.data

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit fbff129

Please sign in to comment.