Skip to content

Commit

Permalink
Implement parallel string merging using buckets
Browse files Browse the repository at this point in the history
Implements: davidlattimore#117
  • Loading branch information
marxin committed Sep 10, 2024
1 parent b866f45 commit 33efe8e
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 45 deletions.
36 changes: 12 additions & 24 deletions wild/tests/integration_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,9 @@
//! that are tested by examining the resulting binaries. Directives have the format '//#Directive:
//! Args'.
//!
//! ExpectComment: Checks that the the next comment in the .comment section is equal to the supplied
//! ExpectComment: Checks that the comment in the .comment section is equal to the supplied
//! argument. If no ExpectComment directives are given then .comment isn't checked. The argument may
//! end with '*' which matches anything. The last ExpectComment directive may start with '?' to
//! indicate that the comment if present should match the rest of the argument, but that it's OK for
//! it to be absent.
//! end with '*' which matches anything.
//!
//! TODO: Document the rest of the directives.

Expand Down Expand Up @@ -973,29 +971,19 @@ impl Assertions {
return Ok(());
}
let actual_comments = read_comments(obj)?;
let mut actual_comments_iter = actual_comments.iter();
let mut expected_comments = self.expected_comments.iter();
loop {
match (expected_comments.next(), actual_comments_iter.next()) {
(None, None) => break,
(None, Some(a)) => bail!("Unexpected .comment `{a}`"),
(Some(e), None) => {
if !e.starts_with('?') {
bail!("Missing expected .comment `{e}`")
}
}
(Some(e), Some(a)) => {
let e = e.strip_prefix('?').unwrap_or(e.as_str());
if let Some(prefix) = e.strip_suffix('*') {
if !a.starts_with(prefix) {
bail!("Expected .comment starting with `{prefix}`, got `{a}`");
}
} else if e != a {
bail!("Expected .comment `{e}`, got `{a}`");
}
for expected in self.expected_comments.iter() {
if let Some(expected) = expected.strip_suffix('*') {
if !actual_comments
.iter()
.any(|actual| actual.starts_with(expected))
{
bail!("Expected .comment starting with `{expected}`");
}
} else if !actual_comments.iter().any(|actual| actual == expected) {
bail!("Expected .comment `{expected}`");
}
}

Ok(())
}

Expand Down
3 changes: 0 additions & 3 deletions wild/tests/sources/comments.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,3 @@ void _start(void) {
//#ExpectComment:GCC*
//#ExpectComment:Foo
//#ExpectComment:Bar

// Our linker writes this, but GNU ld doesn't, so it's optional.
//#ExpectComment:?Linker*
8 changes: 5 additions & 3 deletions wild_lib/src/elf_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1922,9 +1922,11 @@ impl PreludeLayout {
if merged.len() > 0 {
let buffer =
buffers.get_mut(section_id.part_id_with_alignment(crate::alignment::MIN));
for string in &merged.strings {
let dest = crate::slice::slice_take_prefix_mut(buffer, string.len());
dest.copy_from_slice(string)
for bucket in &merged.buckets {
for string in &bucket.strings {
let dest = crate::slice::slice_take_prefix_mut(buffer, string.len());
dest.copy_from_slice(string)
}
}
}
});
Expand Down
4 changes: 4 additions & 0 deletions wild_lib/src/hash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ impl<T> PreHashed<T> {
pub(crate) fn new(value: T, hash: u64) -> Self {
Self { value, hash }
}

/// Returns the hash that was precomputed and stored for the wrapped value.
pub(crate) fn hash(&self) -> u64 {
    self.hash
}
}

impl<T> std::hash::Hash for PreHashed<T> {
Expand Down
86 changes: 71 additions & 15 deletions wild_lib/src/resolution.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use crate::input_data::InputRef;
use crate::input_data::PRELUDE_FILE_ID;
use crate::input_data::UNINITIALISED_FILE_ID;
use crate::output_section_id::CustomSectionDetails;
use crate::output_section_id::OutputSectionId;
use crate::output_section_id::OutputSections;
use crate::output_section_id::OutputSectionsBuilder;
use crate::output_section_id::SectionName;
Expand Down Expand Up @@ -43,7 +44,11 @@ use linker_utils::elf::SectionFlags;
use linker_utils::elf::SectionType;
use object::read::elf::Sym as _;
use object::LittleEndian;
use rayon::iter::ParallelBridge;
use rayon::iter::ParallelIterator;
use std::collections::HashMap;
use std::fmt::Display;
use std::hash::Hash;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::thread::Thread;
Expand Down Expand Up @@ -459,10 +464,12 @@ pub(crate) struct MergeStringsFileSection<'data> {
pub(crate) section_data: &'data [u8],
}

// Number of buckets that merge-candidate strings are sharded into (keyed by
// string hash) so that the buckets can later be merged in parallel.
const MERGE_STRING_BUCKETS: usize = 32;

/// Information about a string-merge section prior to merging.
pub(crate) struct UnresolvedMergeStringsFileSection<'data> {
    /// Index of this section within its input object file.
    section_index: object::SectionIndex,
    /// The section's strings, pre-hashed and sharded into buckets by
    /// `hash % MERGE_STRING_BUCKETS`.
    buckets: [Vec<PreHashed<StringToMerge<'data>>>; MERGE_STRING_BUCKETS],
}

#[derive(PartialEq, Eq, Clone, Copy, Debug)]
Expand All @@ -471,7 +478,7 @@ pub(crate) struct StringToMerge<'data> {
}

#[derive(Default)]
pub(crate) struct MergeStringsSection<'data> {
pub(crate) struct MergeStringsSectionBucket<'data> {
/// The strings in this section in order. Includes null terminators.
pub(crate) strings: Vec<&'data [u8]>,

Expand All @@ -486,9 +493,9 @@ pub(crate) struct MergeStringsSection<'data> {
pub(crate) string_offsets: PassThroughHashMap<StringToMerge<'data>, u64>,
}

impl<'data> MergeStringsSection<'data> {
impl<'data> MergeStringsSectionBucket<'data> {
/// Adds `string`, deduplicating with an existing string if an identical string is already
/// present. Returns the offset into the section.
/// present. Returns the offset within this bucket.
fn add_string(&mut self, string: PreHashed<StringToMerge<'data>>) -> u64 {
self.totally_added += string.bytes.len();
*self.string_offsets.entry(string).or_insert_with(|| {
Expand All @@ -508,14 +515,39 @@ impl<'data> MergeStringsSection<'data> {
}
}

/// Per-output-section state for string merging, split into
/// `MERGE_STRING_BUCKETS` independent buckets so they can be populated in
/// parallel and laid out contiguously afterwards.
#[derive(Default)]
pub(crate) struct MergeStringsSection<'data> {
    /// The buckets based on the hash value of the input string.
    pub(crate) buckets: [MergeStringsSectionBucket<'data>; MERGE_STRING_BUCKETS],

    /// The byte offset of each bucket in the final section. Element 0 is
    /// always 0; each subsequent element is the prefix sum of the lengths of
    /// the preceding buckets.
    pub(crate) bucket_offsets: [u64; MERGE_STRING_BUCKETS],
}

impl<'data> MergeStringsSection<'data> {
    /// Looks up a previously merged string. Returns its byte offset within
    /// the final output section (bucket start plus offset within the
    /// bucket), or `None` if the string was never merged.
    pub(crate) fn get(&self, string: &PreHashed<StringToMerge<'data>>) -> Option<u64> {
        let index = string.hash() as usize % MERGE_STRING_BUCKETS;
        let within_bucket = self.buckets[index].get(string)?;
        Some(self.bucket_offsets[index] + within_bucket)
    }

    /// Total size in bytes of the merged section: where the last bucket
    /// starts plus how far that bucket has been filled.
    pub(crate) fn len(&self) -> u64 {
        let last = MERGE_STRING_BUCKETS - 1;
        self.bucket_offsets[last] + self.buckets[last].next_offset
    }
}

/// Merges identical strings from all loaded objects where those strings are from input sections
/// that are marked with both the SHF_MERGE and SHF_STRINGS flags.
#[tracing::instrument(skip_all, name = "Merge strings")]
fn merge_strings<'data>(
resolved: &mut [ResolvedGroup<'data>],
output_sections: &OutputSections,
) -> Result<OutputSectionMap<MergeStringsSection<'data>>> {
let mut strings_by_section = output_sections.new_section_map::<MergeStringsSection>();
let mut worklist_per_section: HashMap<OutputSectionId, [Vec<_>; MERGE_STRING_BUCKETS]> =
HashMap::new();

for group in resolved {
for file in &mut group.files {
let ResolvedFile::Object(obj) = file else {
Expand All @@ -531,19 +563,42 @@ fn merge_strings<'data>(
bail!("Internal error: expected SectionSlot::MergeStrings");
};

let string_to_offset = strings_by_section.get_mut(sec.part_id.output_section_id());
for string in &merge_info.strings {
string_to_offset.add_string(*string);
let id = sec.part_id.output_section_id();
worklist_per_section.entry(id).or_default();
for (i, bucket) in worklist_per_section
.get_mut(&id)
.unwrap()
.iter_mut()
.enumerate()
{
bucket.push(&merge_info.buckets[i]);
}
}
}
}

strings_by_section.for_each(|section_id, sec| {
if sec.len() > 0 {
tracing::debug!(section = ?output_sections.name(section_id), size = sec.len(), sec.totally_added, strings = sec.strings.len(), "merge_strings");
let mut strings_by_section = output_sections.new_section_map::<MergeStringsSection>();

for (section_id, buckets) in worklist_per_section.iter() {
let merged_strings = strings_by_section.get_mut(*section_id);

buckets
.iter()
.zip(merged_strings.buckets.iter_mut())
.par_bridge()
.for_each(|(string_lists, merged_strings)| {
for strings in string_lists {
for string in strings.iter() {
merged_strings.add_string(*string);
}
}
});

for i in 1..MERGE_STRING_BUCKETS {
merged_strings.bucket_offsets[i] =
merged_strings.bucket_offsets[i - 1] + merged_strings.buckets[i - 1].len();
}
});
}

Ok(strings_by_section)
}
Expand Down Expand Up @@ -962,13 +1017,14 @@ impl<'data> UnresolvedMergeStringsFileSection<'data> {
section_index: object::SectionIndex,
) -> Result<UnresolvedMergeStringsFileSection<'data>> {
let mut remaining = section_data;
let mut strings = Vec::new();
let mut buckets: [Vec<PreHashed<StringToMerge>>; MERGE_STRING_BUCKETS] = Default::default();
while !remaining.is_empty() {
strings.push(StringToMerge::take_hashed(&mut remaining)?);
let string = StringToMerge::take_hashed(&mut remaining)?;
buckets[(string.hash() as usize) % MERGE_STRING_BUCKETS].push(string);
}
Ok(UnresolvedMergeStringsFileSection {
section_index,
strings,
buckets,
})
}
}
Expand Down

0 comments on commit 33efe8e

Please sign in to comment.