Skip to content

Commit fa8762b

Browse files
committed
Auto merge of #112448 - nnethercote:no-tiny-cgus, r=wesleywiser
Introduce a minimum CGU size in non-incremental builds, because tiny CGUs slow down compilation *and* result in worse generated code. r? `@wesleywiser`
2 parents 6330daa + 7c3ce02 commit fa8762b

File tree

5 files changed

+201
-46
lines changed

5 files changed

+201
-46
lines changed

compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -1385,7 +1385,7 @@ fn vcall_visibility_metadata<'ll, 'tcx>(
13851385
let trait_def_id = trait_ref_self.def_id();
13861386
let trait_vis = cx.tcx.visibility(trait_def_id);
13871387

1388-
let cgus = cx.sess().codegen_units();
1388+
let cgus = cx.sess().codegen_units().as_usize();
13891389
let single_cgu = cgus == 1;
13901390

13911391
let lto = cx.sess().lto();

compiler/rustc_codegen_ssa/src/back/write.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -646,10 +646,10 @@ fn produce_final_output_artifacts(
646646
// rlib.
647647
let needs_crate_object = crate_output.outputs.contains_key(&OutputType::Exe);
648648

649-
let keep_numbered_bitcode = user_wants_bitcode && sess.codegen_units() > 1;
649+
let keep_numbered_bitcode = user_wants_bitcode && sess.codegen_units().as_usize() > 1;
650650

651651
let keep_numbered_objects =
652-
needs_crate_object || (user_wants_objects && sess.codegen_units() > 1);
652+
needs_crate_object || (user_wants_objects && sess.codegen_units().as_usize() > 1);
653653

654654
for module in compiled_modules.modules.iter() {
655655
if let Some(ref path) = module.object {
@@ -1923,7 +1923,7 @@ impl<B: ExtraBackendMethods> OngoingCodegen<B> {
19231923

19241924
// FIXME: time_llvm_passes support - does this use a global context or
19251925
// something?
1926-
if sess.codegen_units() == 1 && sess.opts.unstable_opts.time_llvm_passes {
1926+
if sess.codegen_units().as_usize() == 1 && sess.opts.unstable_opts.time_llvm_passes {
19271927
self.backend.print_pass_timings()
19281928
}
19291929

compiler/rustc_monomorphize/src/partitioning.rs

+168-34
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ use rustc_middle::query::Providers;
113113
use rustc_middle::ty::print::{characteristic_def_id_of_type, with_no_trimmed_paths};
114114
use rustc_middle::ty::{self, visit::TypeVisitableExt, InstanceDef, TyCtxt};
115115
use rustc_session::config::{DumpMonoStatsFormat, SwitchWithOptPath};
116+
use rustc_session::CodegenUnits;
116117
use rustc_span::symbol::Symbol;
117118

118119
use crate::collector::UsageMap;
@@ -121,7 +122,6 @@ use crate::errors::{CouldntDumpMonoStats, SymbolAlreadyDefined, UnknownCguCollec
121122

122123
struct PartitioningCx<'a, 'tcx> {
123124
tcx: TyCtxt<'tcx>,
124-
target_cgu_count: usize,
125125
usage_map: &'a UsageMap<'tcx>,
126126
}
127127

@@ -130,26 +130,30 @@ struct PlacedRootMonoItems<'tcx> {
130130
codegen_units: Vec<CodegenUnit<'tcx>>,
131131

132132
internalization_candidates: FxHashSet<MonoItem<'tcx>>,
133+
134+
/// These must be obtained when the iterator in `partition` runs. They
135+
/// can't be obtained later because some inlined functions might not be
136+
/// reachable.
137+
unique_inlined_stats: (usize, usize),
133138
}
134139

135140
// The output CGUs are sorted by name.
136141
fn partition<'tcx, I>(
137142
tcx: TyCtxt<'tcx>,
138143
mono_items: I,
139-
max_cgu_count: usize,
140144
usage_map: &UsageMap<'tcx>,
141145
) -> Vec<CodegenUnit<'tcx>>
142146
where
143147
I: Iterator<Item = MonoItem<'tcx>>,
144148
{
145149
let _prof_timer = tcx.prof.generic_activity("cgu_partitioning");
146150

147-
let cx = &PartitioningCx { tcx, target_cgu_count: max_cgu_count, usage_map };
151+
let cx = &PartitioningCx { tcx, usage_map };
148152

149153
// In the first step, we place all regular monomorphizations into their
150154
// respective 'home' codegen unit. Regular monomorphizations are all
151155
// functions and statics defined in the local crate.
152-
let PlacedRootMonoItems { mut codegen_units, internalization_candidates } = {
156+
let PlacedRootMonoItems { mut codegen_units, internalization_candidates, unique_inlined_stats } = {
153157
let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_place_roots");
154158
place_root_mono_items(cx, mono_items)
155159
};
@@ -158,15 +162,15 @@ where
158162
cgu.create_size_estimate(tcx);
159163
}
160164

161-
debug_dump(tcx, "INITIAL PARTITIONING", &codegen_units);
165+
debug_dump(tcx, "ROOTS", &codegen_units, unique_inlined_stats);
162166

163167
// Merge until we have at most `tcx.sess.codegen_units().as_usize()` codegen units.
164168
// `merge_codegen_units` is responsible for updating the CGU size
165169
// estimates.
166170
{
167171
let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_merge_cgus");
168172
merge_codegen_units(cx, &mut codegen_units);
169-
debug_dump(tcx, "POST MERGING", &codegen_units);
173+
debug_dump(tcx, "MERGE", &codegen_units, unique_inlined_stats);
170174
}
171175

172176
// In the next step, we use the inlining map to determine which additional
@@ -182,7 +186,7 @@ where
182186
cgu.create_size_estimate(tcx);
183187
}
184188

185-
debug_dump(tcx, "POST INLINING", &codegen_units);
189+
debug_dump(tcx, "INLINE", &codegen_units, unique_inlined_stats);
186190

187191
// Next we try to make as many symbols "internal" as possible, so LLVM has
188192
// more freedom to optimize.
@@ -226,7 +230,7 @@ where
226230
// Ensure CGUs are sorted by name, so that we get deterministic results.
227231
assert!(codegen_units.is_sorted_by(|a, b| Some(a.name().as_str().cmp(b.name().as_str()))));
228232

229-
debug_dump(tcx, "FINAL", &codegen_units);
233+
debug_dump(tcx, "FINAL", &codegen_units, unique_inlined_stats);
230234

231235
codegen_units
232236
}
@@ -252,10 +256,16 @@ where
252256
let cgu_name_builder = &mut CodegenUnitNameBuilder::new(cx.tcx);
253257
let cgu_name_cache = &mut FxHashMap::default();
254258

259+
let mut num_unique_inlined_items = 0;
260+
let mut unique_inlined_items_size = 0;
255261
for mono_item in mono_items {
256262
match mono_item.instantiation_mode(cx.tcx) {
257263
InstantiationMode::GloballyShared { .. } => {}
258-
InstantiationMode::LocalCopy => continue,
264+
InstantiationMode::LocalCopy => {
265+
num_unique_inlined_items += 1;
266+
unique_inlined_items_size += mono_item.size_estimate(cx.tcx);
267+
continue;
268+
}
259269
}
260270

261271
let characteristic_def_id = characteristic_def_id_of_mono_item(cx.tcx, mono_item);
@@ -300,7 +310,11 @@ where
300310
let mut codegen_units: Vec<_> = codegen_units.into_values().collect();
301311
codegen_units.sort_by(|a, b| a.name().as_str().cmp(b.name().as_str()));
302312

303-
PlacedRootMonoItems { codegen_units, internalization_candidates }
313+
PlacedRootMonoItems {
314+
codegen_units,
315+
internalization_candidates,
316+
unique_inlined_stats: (num_unique_inlined_items, unique_inlined_items_size),
317+
}
304318
}
305319

306320
// This function requires the CGUs to be sorted by name on input, and ensures
@@ -309,7 +323,7 @@ fn merge_codegen_units<'tcx>(
309323
cx: &PartitioningCx<'_, 'tcx>,
310324
codegen_units: &mut Vec<CodegenUnit<'tcx>>,
311325
) {
312-
assert!(cx.target_cgu_count >= 1);
326+
assert!(cx.tcx.sess.codegen_units().as_usize() >= 1);
313327

314328
// A sorted order here ensures merging is deterministic.
315329
assert!(codegen_units.is_sorted_by(|a, b| Some(a.name().as_str().cmp(b.name().as_str()))));
@@ -318,11 +332,32 @@ fn merge_codegen_units<'tcx>(
318332
let mut cgu_contents: FxHashMap<Symbol, Vec<Symbol>> =
319333
codegen_units.iter().map(|cgu| (cgu.name(), vec![cgu.name()])).collect();
320334

321-
// Merge the two smallest codegen units until the target size is
322-
// reached.
323-
while codegen_units.len() > cx.target_cgu_count {
324-
// Sort small cgus to the back
335+
// Having multiple CGUs can drastically speed up compilation. But for
336+
// non-incremental builds, tiny CGUs slow down compilation *and* result in
337+
// worse generated code. So we don't allow CGUs smaller than this (unless
338+
// there is just one CGU, of course). Note that CGU sizes of 100,000+ are
339+
// common in larger programs, so this isn't all that large.
340+
const NON_INCR_MIN_CGU_SIZE: usize = 1000;
341+
342+
// Repeatedly merge the two smallest codegen units as long as:
343+
// - we have more CGUs than the upper limit, or
344+
// - (Non-incremental builds only) the user didn't specify a CGU count, and
345+
// there are multiple CGUs, and some are below the minimum size.
346+
//
347+
// The "didn't specify a CGU count" condition is because when an explicit
348+
// count is requested we observe it as closely as possible. For example,
349+
// the `compiler_builtins` crate sets `codegen-units = 10000` and it's
350+
// critical they aren't merged. Also, some tests use explicit small values
351+
// and likewise won't work if small CGUs are merged.
352+
while codegen_units.len() > cx.tcx.sess.codegen_units().as_usize()
353+
|| (cx.tcx.sess.opts.incremental.is_none()
354+
&& matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_))
355+
&& codegen_units.len() > 1
356+
&& codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE))
357+
{
358+
// Sort small cgus to the back.
325359
codegen_units.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate()));
360+
326361
let mut smallest = codegen_units.pop().unwrap();
327362
let second_smallest = codegen_units.last_mut().unwrap();
328363

@@ -814,47 +849,147 @@ fn default_visibility(tcx: TyCtxt<'_>, id: DefId, is_generic: bool) -> Visibilit
814849
}
815850
}
816851

817-
fn debug_dump<'a, 'tcx: 'a>(tcx: TyCtxt<'tcx>, label: &str, cgus: &[CodegenUnit<'tcx>]) {
852+
fn debug_dump<'a, 'tcx: 'a>(
853+
tcx: TyCtxt<'tcx>,
854+
label: &str,
855+
cgus: &[CodegenUnit<'tcx>],
856+
(unique_inlined_items, unique_inlined_size): (usize, usize),
857+
) {
818858
let dump = move || {
819859
use std::fmt::Write;
820860

821-
let num_cgus = cgus.len();
822-
let num_items: usize = cgus.iter().map(|cgu| cgu.items().len()).sum();
823-
let total_size: usize = cgus.iter().map(|cgu| cgu.size_estimate()).sum();
824-
let max_size = cgus.iter().map(|cgu| cgu.size_estimate()).max().unwrap();
825-
let min_size = cgus.iter().map(|cgu| cgu.size_estimate()).min().unwrap();
826-
let max_min_size_ratio = max_size as f64 / min_size as f64;
861+
let mut num_cgus = 0;
862+
let mut all_cgu_sizes = Vec::new();
863+
864+
// Note: every unique root item is placed exactly once, so the number
865+
// of unique root items always equals the number of placed root items.
866+
867+
let mut root_items = 0;
868+
// unique_inlined_items is passed in above.
869+
let mut placed_inlined_items = 0;
870+
871+
let mut root_size = 0;
872+
// unique_inlined_size is passed in above.
873+
let mut placed_inlined_size = 0;
874+
875+
for cgu in cgus.iter() {
876+
num_cgus += 1;
877+
all_cgu_sizes.push(cgu.size_estimate());
878+
879+
for (item, _) in cgu.items() {
880+
match item.instantiation_mode(tcx) {
881+
InstantiationMode::GloballyShared { .. } => {
882+
root_items += 1;
883+
root_size += item.size_estimate(tcx);
884+
}
885+
InstantiationMode::LocalCopy => {
886+
placed_inlined_items += 1;
887+
placed_inlined_size += item.size_estimate(tcx);
888+
}
889+
}
890+
}
891+
}
892+
893+
all_cgu_sizes.sort_unstable_by_key(|&n| cmp::Reverse(n));
894+
895+
let unique_items = root_items + unique_inlined_items;
896+
let placed_items = root_items + placed_inlined_items;
897+
let items_ratio = placed_items as f64 / unique_items as f64;
898+
899+
let unique_size = root_size + unique_inlined_size;
900+
let placed_size = root_size + placed_inlined_size;
901+
let size_ratio = placed_size as f64 / unique_size as f64;
902+
903+
let mean_cgu_size = placed_size as f64 / num_cgus as f64;
904+
905+
assert_eq!(placed_size, all_cgu_sizes.iter().sum::<usize>());
827906

828907
let s = &mut String::new();
908+
let _ = writeln!(s, "{label}");
829909
let _ = writeln!(
830910
s,
831-
"{label} ({num_items} items, total_size={total_size}; {num_cgus} CGUs, \
832-
max_size={max_size}, min_size={min_size}, max_size/min_size={max_min_size_ratio:.1}):"
911+
"- unique items: {unique_items} ({root_items} root + {unique_inlined_items} inlined), \
912+
unique size: {unique_size} ({root_size} root + {unique_inlined_size} inlined)\n\
913+
- placed items: {placed_items} ({root_items} root + {placed_inlined_items} inlined), \
914+
placed size: {placed_size} ({root_size} root + {placed_inlined_size} inlined)\n\
915+
- placed/unique items ratio: {items_ratio:.2}, \
916+
placed/unique size ratio: {size_ratio:.2}\n\
917+
- CGUs: {num_cgus}, mean size: {mean_cgu_size:.1}, sizes: {}",
918+
list(&all_cgu_sizes),
833919
);
920+
let _ = writeln!(s);
921+
834922
for (i, cgu) in cgus.iter().enumerate() {
923+
let name = cgu.name();
924+
let size = cgu.size_estimate();
835925
let num_items = cgu.items().len();
836-
let _ = writeln!(
837-
s,
838-
"- CGU[{i}] {} ({num_items} items, size={}):",
839-
cgu.name(),
840-
cgu.size_estimate()
841-
);
926+
let mean_size = size as f64 / num_items as f64;
927+
928+
let mut placed_item_sizes: Vec<_> =
929+
cgu.items().iter().map(|(item, _)| item.size_estimate(tcx)).collect();
930+
placed_item_sizes.sort_unstable_by_key(|&n| cmp::Reverse(n));
931+
let sizes = list(&placed_item_sizes);
932+
933+
let _ = writeln!(s, "- CGU[{i}]");
934+
let _ = writeln!(s, " - {name}, size: {size}");
935+
let _ =
936+
writeln!(s, " - items: {num_items}, mean size: {mean_size:.1}, sizes: {sizes}",);
842937

843938
for (item, linkage) in cgu.items_in_deterministic_order(tcx) {
844939
let symbol_name = item.symbol_name(tcx).name;
845940
let symbol_hash_start = symbol_name.rfind('h');
846941
let symbol_hash = symbol_hash_start.map_or("<no hash>", |i| &symbol_name[i..]);
847942
let size = item.size_estimate(tcx);
943+
let kind = match item.instantiation_mode(tcx) {
944+
InstantiationMode::GloballyShared { .. } => "root",
945+
InstantiationMode::LocalCopy => "inlined",
946+
};
848947
let _ = with_no_trimmed_paths!(writeln!(
849948
s,
850-
" - {item} [{linkage:?}] [{symbol_hash}] (size={size})"
949+
" - {item} [{linkage:?}] [{symbol_hash}] ({kind}, size: {size})"
851950
));
852951
}
853952

854953
let _ = writeln!(s);
855954
}
856955

857-
std::mem::take(s)
956+
return std::mem::take(s);
957+
958+
// Converts a slice to a string, capturing repetitions to save space.
// E.g. `[4, 4, 4, 3, 2, 1, 1, 1, 1, 1]` -> "[4 (x3), 3, 2, 1 (x5)]".
//
// Only *adjacent* equal values are collapsed into a single `value (xN)`
// entry, so a caller that wants every duplicate grouped must pass a sorted
// slice. An empty slice yields "[]".
fn list(ns: &[usize]) -> String {
    // Run-length encode the input as `(value, repeat count)` pairs. Nothing
    // is allocated for an empty slice because the vector never grows.
    let mut runs: Vec<(usize, usize)> = Vec::new();
    for &n in ns {
        match runs.last_mut() {
            // Same value as the run currently being built: extend it.
            Some(run) if run.0 == n => run.1 += 1,
            // First element, or a different value: start a new run.
            _ => runs.push((n, 1)),
        }
    }

    // Render each run; a count of one is printed without the `(xN)` suffix.
    let parts: Vec<String> = runs
        .iter()
        .map(|&(value, count)| {
            if count == 1 { value.to_string() } else { format!("{value} (x{count})") }
        })
        .collect();

    format!("[{}]", parts.join(", "))
}
858993
};
859994

860995
debug!("{}", dump());
@@ -922,8 +1057,7 @@ fn collect_and_partition_mono_items(tcx: TyCtxt<'_>, (): ()) -> (&DefIdSet, &[Co
9221057
let (codegen_units, _) = tcx.sess.time("partition_and_assert_distinct_symbols", || {
9231058
sync::join(
9241059
|| {
925-
let mut codegen_units =
926-
partition(tcx, items.iter().copied(), tcx.sess.codegen_units(), &usage_map);
1060+
let mut codegen_units = partition(tcx, items.iter().copied(), &usage_map);
9271061
codegen_units[0].make_primary();
9281062
&*tcx.arena.alloc_from_iter(codegen_units)
9291063
},

0 commit comments

Comments
 (0)