Skip to content

Commit e76e5b2

Browse files
committed
Turbopack: remove value compression dictionary
The value compression dictionary provides no benefit: small values are merged together into blocks that are already large, and medium-sized values are large enough on their own. Removing it also means that medium value blocks no longer need to be recompressed during compaction.
1 parent aef6cf5 commit e76e5b2

File tree

10 files changed

+90
-146
lines changed

10 files changed

+90
-146
lines changed

turbopack/crates/turbo-persistence-tools/src/main.rs

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ fn main() -> Result<()> {
3535
amqf_entries,
3636
sst_size,
3737
key_compression_dictionary_size,
38-
value_compression_dictionary_size,
3938
block_count,
4039
} in meta_file.entries
4140
{
@@ -45,15 +44,11 @@ fn main() -> Result<()> {
4544
);
4645
println!(" AMQF {amqf_entries} entries = {} KiB", amqf_size / 1024);
4746
println!(
48-
" {} KiB = {} kiB key compression dict + {} KiB value compression dict + \
49-
{block_count} blocks (avg {} bytes/block)",
47+
" {} KiB = {} kiB key compression dict + {block_count} blocks (avg {} \
48+
bytes/block)",
5049
sst_size / 1024,
5150
key_compression_dictionary_size / 1024,
52-
value_compression_dictionary_size / 1024,
53-
(sst_size
54-
- key_compression_dictionary_size as u64
55-
- value_compression_dictionary_size as u64)
56-
/ block_count as u64
51+
(sst_size - key_compression_dictionary_size as u64) / block_count as u64
5752
);
5853
}
5954
if !meta_file.obsolete_sst_files.is_empty() {

turbopack/crates/turbo-persistence/README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ A meta file can contain metadata about multiple SST files. The metadata is store
4545
- foreach described SST file
4646
- 4 bytes sequence number of the SST file
4747
- 2 bytes key Compression Dictionary length
48-
- 2 bytes value Compression Dictionary length
4948
- 2 bytes block count
5049
- 8 bytes min hash
5150
- 8 bytes max hash
@@ -59,7 +58,6 @@ A meta file can contain metadata about multiple SST files. The metadata is store
5958
The SST file contains only data without any header.
6059

6160
- serialized key Compression Dictionary
62-
- serialized value Compression Dictionary
6361
- foreach block
6462
- 4 bytes uncompressed block length
6563
- compressed data

turbopack/crates/turbo-persistence/src/collector.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,11 @@ impl<K: StoreKey, const SIZE_SHIFT: usize> Collector<K, SIZE_SHIFT> {
9292
self.entries.push(entry);
9393
}
9494

95-
/// Sorts the entries and returns them along with the total key and value sizes. This doesn't
95+
/// Sorts the entries and returns them along with the total key size. This doesn't
9696
/// clear the entries.
97-
pub fn sorted(&mut self) -> (&[CollectorEntry<K>], usize, usize) {
97+
pub fn sorted(&mut self) -> (&[CollectorEntry<K>], usize) {
9898
self.entries.sort_unstable_by(|a, b| a.key.cmp(&b.key));
99-
(&self.entries, self.total_key_size, self.total_value_size)
99+
(&self.entries, self.total_key_size)
100100
}
101101

102102
/// Clears the collector.

turbopack/crates/turbo-persistence/src/db.rs

Lines changed: 10 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -903,8 +903,6 @@ impl<S: ParallelScheduler> TurboPersistence<S> {
903903
amqf,
904904
key_compression_dictionary_length: entry
905905
.key_compression_dictionary_length(),
906-
value_compression_dictionary_length: entry
907-
.value_compression_dictionary_length(),
908906
block_count: entry.block_count(),
909907
size: entry.size(),
910908
entries: 0,
@@ -918,7 +916,6 @@ impl<S: ParallelScheduler> TurboPersistence<S> {
918916
fn create_sst_file<'l>(
919917
entries: &[LookupEntry<'l>],
920918
total_key_size: usize,
921-
total_value_size: usize,
922919
path: &Path,
923920
seq: u32,
924921
) -> Result<(u32, File, StaticSortedFileBuilderMeta<'static>)>
@@ -928,7 +925,6 @@ impl<S: ParallelScheduler> TurboPersistence<S> {
928925
let (meta, file) = write_static_stored_file(
929926
entries,
930927
total_key_size,
931-
total_value_size,
932928
&path.join(format!("{seq:08}.sst")),
933929
)?;
934930
Ok((seq, file, meta))
@@ -962,7 +958,7 @@ impl<S: ParallelScheduler> TurboPersistence<S> {
962958
let mut current: Option<LookupEntry<'_>> = None;
963959
let mut entries = Vec::new();
964960
let mut last_entries = Vec::new();
965-
let mut last_entries_total_sizes = (0, 0);
961+
let mut last_entries_total_key_size = 0;
966962
for entry in iter {
967963
let entry = entry?;
968964

@@ -979,15 +975,11 @@ impl<S: ParallelScheduler> TurboPersistence<S> {
979975
> DATA_THRESHOLD_PER_COMPACTED_FILE
980976
|| entries.len() >= MAX_ENTRIES_PER_COMPACTED_FILE
981977
{
982-
let (
983-
selected_total_key_size,
984-
selected_total_value_size,
985-
) = last_entries_total_sizes;
978+
let selected_total_key_size =
979+
last_entries_total_key_size;
986980
swap(&mut entries, &mut last_entries);
987-
last_entries_total_sizes = (
988-
total_key_size - key_size,
989-
total_value_size - value_size,
990-
);
981+
last_entries_total_key_size =
982+
total_key_size - key_size;
991983
total_key_size = key_size;
992984
total_value_size = value_size;
993985

@@ -1000,7 +992,6 @@ impl<S: ParallelScheduler> TurboPersistence<S> {
1000992
new_sst_files.push(create_sst_file(
1001993
&entries,
1002994
selected_total_key_size,
1003-
selected_total_value_size,
1004995
path,
1005996
seq,
1006997
)?);
@@ -1018,7 +1009,8 @@ impl<S: ParallelScheduler> TurboPersistence<S> {
10181009
}
10191010
if let Some(entry) = current {
10201011
total_key_size += entry.key.len();
1021-
total_value_size += entry.value.uncompressed_size_in_sst();
1012+
// Obsolete as we no longer need total_value_size
1013+
// total_value_size += entry.value.uncompressed_size_in_sst();
10221014
entries.push(entry);
10231015
}
10241016

@@ -1030,7 +1022,6 @@ impl<S: ParallelScheduler> TurboPersistence<S> {
10301022
new_sst_files.push(create_sst_file(
10311023
&entries,
10321024
total_key_size,
1033-
total_value_size,
10341025
path,
10351026
seq,
10361027
)?);
@@ -1041,8 +1032,7 @@ impl<S: ParallelScheduler> TurboPersistence<S> {
10411032
if !last_entries.is_empty() {
10421033
last_entries.append(&mut entries);
10431034

1044-
last_entries_total_sizes.0 += total_key_size;
1045-
last_entries_total_sizes.1 += total_value_size;
1035+
last_entries_total_key_size += total_key_size;
10461036

10471037
let (part1, part2) =
10481038
last_entries.split_at(last_entries.len() / 2);
@@ -1054,17 +1044,15 @@ impl<S: ParallelScheduler> TurboPersistence<S> {
10541044
new_sst_files.push(create_sst_file(
10551045
part1,
10561046
// We don't know the exact key size of each half, so we estimate it as half of the total
1057-
last_entries_total_sizes.0 / 2,
1058-
last_entries_total_sizes.1 / 2,
1047+
last_entries_total_key_size / 2,
10591048
path,
10601049
seq1,
10611050
)?);
10621051

10631052
keys_written += part2.len() as u64;
10641053
new_sst_files.push(create_sst_file(
10651054
part2,
1066-
last_entries_total_sizes.0 / 2,
1067-
last_entries_total_sizes.1 / 2,
1055+
last_entries_total_key_size / 2,
10681056
path,
10691057
seq2,
10701058
)?);
@@ -1264,8 +1252,6 @@ impl<S: ParallelScheduler> TurboPersistence<S> {
12641252
amqf_entries: amqf.len(),
12651253
key_compression_dictionary_size: entry
12661254
.key_compression_dictionary_length(),
1267-
value_compression_dictionary_size: entry
1268-
.value_compression_dictionary_length(),
12691255
block_count: entry.block_count(),
12701256
}
12711257
})
@@ -1303,6 +1289,5 @@ pub struct MetaFileEntryInfo {
13031289
pub amqf_entries: usize,
13041290
pub sst_size: u64,
13051291
pub key_compression_dictionary_size: u16,
1306-
pub value_compression_dictionary_size: u16,
13071292
pub block_count: u16,
13081293
}

turbopack/crates/turbo-persistence/src/lookup_entry.rs

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ pub enum LazyLookupValue<'l> {
2222
Medium {
2323
uncompressed_size: u32,
2424
block: &'l [u8],
25-
dictionary: &'l [u8],
2625
},
2726
}
2827

@@ -79,11 +78,9 @@ impl Entry for LookupEntry<'_> {
7978
LazyLookupValue::Medium {
8079
uncompressed_size,
8180
block,
82-
dictionary,
8381
} => EntryValue::MediumCompressed {
8482
uncompressed_size: *uncompressed_size,
8583
block,
86-
dictionary,
8784
},
8885
}
8986
}

turbopack/crates/turbo-persistence/src/meta_file.rs

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -144,10 +144,6 @@ impl MetaEntry {
144144
self.sst_data.key_compression_dictionary_length
145145
}
146146

147-
pub fn value_compression_dictionary_length(&self) -> u16 {
148-
self.sst_data.value_compression_dictionary_length
149-
}
150-
151147
pub fn block_count(&self) -> u16 {
152148
self.sst_data.block_count
153149
}
@@ -222,7 +218,6 @@ impl MetaFile {
222218
sst_data: StaticSortedFileMetaData {
223219
sequence_number: file.read_u32::<BE>()?,
224220
key_compression_dictionary_length: file.read_u16::<BE>()?,
225-
value_compression_dictionary_length: file.read_u16::<BE>()?,
226221
block_count: file.read_u16::<BE>()?,
227222
},
228223
family,

turbopack/crates/turbo-persistence/src/meta_file_builder.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ impl<'a> MetaFileBuilder<'a> {
5858
for (sequence_number, sst) in &self.entries {
5959
file.write_u32::<BE>(*sequence_number)?;
6060
file.write_u16::<BE>(sst.key_compression_dictionary_length)?;
61-
file.write_u16::<BE>(sst.value_compression_dictionary_length)?;
6261
file.write_u16::<BE>(sst.block_count)?;
6362
file.write_u64::<BE>(sst.min_hash)?;
6463
file.write_u64::<BE>(sst.max_hash)?;

turbopack/crates/turbo-persistence/src/static_sorted_file.rs

Lines changed: 25 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use std::{
1010

1111
use anyhow::{Context, Result, bail};
1212
use byteorder::{BE, ReadBytesExt};
13-
use lzzzz::lz4::decompress_with_dict;
13+
use lzzzz::lz4::{decompress, decompress_with_dict};
1414
use memmap2::Mmap;
1515
use quick_cache::sync::GuardResult;
1616
use rustc_hash::FxHasher;
@@ -67,8 +67,6 @@ pub struct StaticSortedFileMetaData {
6767
pub sequence_number: u32,
6868
/// The length of the key compression dictionary.
6969
pub key_compression_dictionary_length: u16,
70-
/// The length of the value compression dictionary.
71-
pub value_compression_dictionary_length: u16,
7270
/// The number of blocks in the SST file.
7371
pub block_count: u16,
7472
}
@@ -81,21 +79,14 @@ impl StaticSortedFileMetaData {
8179

8280
pub fn blocks_start(&self) -> usize {
8381
let k: usize = self.key_compression_dictionary_length.into();
84-
let v: usize = self.value_compression_dictionary_length.into();
85-
k + v
82+
k
8683
}
8784

8885
pub fn key_compression_dictionary_range(&self) -> Range<usize> {
8986
let start = 0;
9087
let end: usize = self.key_compression_dictionary_length.into();
9188
start..end
9289
}
93-
94-
pub fn value_compression_dictionary_range(&self) -> Range<usize> {
95-
let start = self.key_compression_dictionary_length as usize;
96-
let end = start + self.value_compression_dictionary_length as usize;
97-
start..end
98-
}
9990
}
10091

10192
/// A memory mapped SST file.
@@ -323,25 +314,34 @@ impl StaticSortedFile {
323314

324315
/// Reads a key block from the file.
325316
fn read_key_block(&self, block_index: u16) -> Result<ArcSlice<u8>> {
326-
self.read_block(
317+
self.read_block_with_dict(
327318
block_index,
328319
&self.mmap[self.meta.key_compression_dictionary_range()],
329320
)
330321
}
331322

332323
/// Reads a value block from the file.
333324
fn read_value_block(&self, block_index: u16) -> Result<ArcSlice<u8>> {
334-
self.read_block(
335-
block_index,
336-
&self.mmap[self.meta.value_compression_dictionary_range()],
337-
)
325+
self.read_block(block_index)
326+
}
327+
328+
/// Reads a block from the file and decompresses it using the given compression dictionary.
329+
fn read_block_with_dict(
330+
&self,
331+
block_index: u16,
332+
compression_dictionary: &[u8],
333+
) -> Result<ArcSlice<u8>> {
334+
let (uncompressed_length, block) = self.get_compressed_block(block_index)?;
335+
336+
let buffer = decompress_into_arc(uncompressed_length, block, Some(compression_dictionary))?;
337+
Ok(ArcSlice::from(buffer))
338338
}
339339

340340
/// Reads a block from the file.
341-
fn read_block(&self, block_index: u16, compression_dictionary: &[u8]) -> Result<ArcSlice<u8>> {
341+
fn read_block(&self, block_index: u16) -> Result<ArcSlice<u8>> {
342342
let (uncompressed_length, block) = self.get_compressed_block(block_index)?;
343343

344-
let buffer = decompress_into_arc(uncompressed_length, block, compression_dictionary)?;
344+
let buffer = decompress_into_arc(uncompressed_length, block, None)?;
345345
Ok(ArcSlice::from(buffer))
346346
}
347347

@@ -486,8 +486,6 @@ impl<'l> StaticSortedFileIter<'l> {
486486
LazyLookupValue::Medium {
487487
uncompressed_size,
488488
block,
489-
dictionary: &self.this.mmap
490-
[self.this.meta.value_compression_dictionary_range()],
491489
}
492490
} else {
493491
let value = self
@@ -587,10 +585,10 @@ fn get_key_entry<'l>(
587585
})
588586
}
589587

590-
pub fn decompress_into_arc(
588+
fn decompress_into_arc(
591589
uncompressed_length: u32,
592590
block: &[u8],
593-
compression_dictionary: &[u8],
591+
compression_dictionary: Option<&[u8]>,
594592
) -> Result<Arc<[u8]>> {
595593
// We directly allocate the buffer in an Arc to avoid copying it into an Arc and avoiding
596594
// double indirection. This is a dynamically sized arc.
@@ -602,7 +600,11 @@ pub fn decompress_into_arc(
602600
// Safety: We know that the buffer is not shared yet.
603601
let decompressed = unsafe { Arc::get_mut_unchecked(&mut buffer) };
604602
// Safety: decompress_with_dict will only write to `decompressed` and not read from it.
605-
let bytes_writes = decompress_with_dict(block, decompressed, compression_dictionary)?;
603+
let bytes_writes = if let Some(dict) = compression_dictionary {
604+
decompress_with_dict(&block, decompressed, dict)?
605+
} else {
606+
decompress(&block, decompressed)?
607+
};
606608
assert_eq!(
607609
bytes_writes, uncompressed_length as usize,
608610
"Decompressed length does not match expected length"

0 commit comments

Comments
 (0)