Skip to content

Commit 79c44fe

Browse files
committed
Turbopack: improve compression dictionary generation
1 parent 6b89fe9 commit 79c44fe

File tree

1 file changed

+59
-43
lines changed

1 file changed

+59
-43
lines changed

turbopack/crates/turbo-persistence/src/static_sorted_file_builder.rs

Lines changed: 59 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
use std::{
2+
<<<<<<< HEAD
23
borrow::Cow,
34
cmp::min,
5+
||||||| parent of 1158fb094f (Turbopack: improve compression dictionary generation)
6+
cmp::min,
7+
=======
8+
cmp::{max, min},
9+
>>>>>>> 1158fb094f (Turbopack: improve compression dictionary generation)
410
fs::File,
511
io::{self, BufWriter, Seek, Write},
612
path::Path,
@@ -45,6 +51,8 @@ const MIN_VALUE_COMPRESSION_SAMPLES_SIZE: usize = 1024;
4551
const MIN_KEY_COMPRESSION_SAMPLES_SIZE: usize = 1024;
4652
/// The maximum number of bytes taken from a single key/value entry for a sample.
4753
const COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY: usize = 100;
54+
/// The minimum number of bytes a key/value entry must provide to be used as a sample; shorter entries are skipped.
55+
const MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY: usize = 16;
4856

4957
/// Trait for entries from which SST files can be created
5058
pub trait Entry {
@@ -153,58 +161,75 @@ impl<'a> StaticSortedFileBuilder<'a> {
153161
{
154162
return Ok(());
155163
}
156-
let key_compression_samples_size = min(KEY_COMPRESSION_SAMPLES_SIZE, total_key_size / 10);
164+
let key_compression_samples_size = min(KEY_COMPRESSION_SAMPLES_SIZE, total_key_size / 16);
157165
let value_compression_samples_size =
158-
min(VALUE_COMPRESSION_SAMPLES_SIZE, total_value_size / 10);
166+
min(VALUE_COMPRESSION_SAMPLES_SIZE, total_value_size / 16);
159167
let mut value_samples = Vec::with_capacity(value_compression_samples_size);
160168
let mut value_sample_sizes = Vec::new();
161169
let mut key_samples = Vec::with_capacity(key_compression_samples_size);
162170
let mut key_sample_sizes = Vec::new();
163-
let mut i = 12345678 % entries.len();
164-
let mut j = 0;
165-
loop {
166-
let entry = &entries[i];
171+
172+
// Limit the number of iterations to avoid infinite loops
173+
let max_iterations =
174+
max(total_key_size, total_value_size) / COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY * 2;
175+
for i in 0..max_iterations {
176+
let entry = &entries[i % entries.len()];
167177
let value_remaining = value_compression_samples_size - value_samples.len();
168-
let key_remaining = key_compression_samples_size - key_samples.len();
169-
if value_remaining > 0
170-
&& let EntryValue::Small { value } | EntryValue::Medium { value } = entry.value()
171-
{
172-
let value = if value.len() <= COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
173-
value
174-
} else {
175-
j = (j + 12345678) % (value.len() - COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY);
176-
&value[j..j + COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY]
177-
};
178-
if value.len() <= value_remaining {
179-
value_sample_sizes.push(value.len());
180-
value_samples.extend_from_slice(value);
181-
} else {
182-
value_sample_sizes.push(value_remaining);
183-
value_samples.extend_from_slice(&value[..value_remaining]);
178+
if value_remaining < MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
179+
break;
180+
}
181+
if let EntryValue::Small { value } | EntryValue::Medium { value } = entry.value() {
182+
let len = value.len();
183+
if len >= MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
184+
let used_len = min(value_remaining, COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY);
185+
if len <= used_len {
186+
value_sample_sizes.push(len);
187+
value_samples.extend_from_slice(value);
188+
} else {
189+
value_sample_sizes.push(used_len);
190+
let p = value_samples.len() % (len - used_len);
191+
value_samples.extend_from_slice(&value[p..p + used_len]);
192+
};
184193
}
185194
}
186-
if key_remaining > 0 {
195+
}
196+
assert!(value_samples.len() == value_sample_sizes.iter().sum::<usize>());
197+
if value_samples.len() > MIN_VALUE_COMPRESSION_SAMPLES_SIZE && value_sample_sizes.len() > 5
198+
{
199+
self.value_compression_dictionary = zstd::dict::from_continuous(
200+
&value_samples,
201+
&value_sample_sizes,
202+
VALUE_COMPRESSION_DICTIONARY_SIZE,
203+
)
204+
.context("Value dictionary creation failed")?;
205+
} else {
206+
self.value_compression_dictionary = Vec::new();
207+
}
208+
209+
for i in 0..max_iterations {
210+
let entry = &entries[i % entries.len()];
211+
let key_remaining = key_compression_samples_size - key_samples.len();
212+
if key_remaining < MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
213+
break;
214+
}
215+
let len = entry.key_len();
216+
if len >= MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
187217
let used_len = min(key_remaining, COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY);
188-
if entry.key_len() <= used_len {
189-
key_sample_sizes.push(entry.key_len());
218+
if len <= used_len {
219+
key_sample_sizes.push(len);
190220
entry.write_key_to(&mut key_samples);
191221
} else {
192-
let mut temp = Vec::with_capacity(entry.key_len());
222+
let mut temp = Vec::with_capacity(len);
193223
entry.write_key_to(&mut temp);
194-
debug_assert!(temp.len() == entry.key_len());
224+
debug_assert!(temp.len() == len);
195225

196-
j = (j + 12345678) % (temp.len() - used_len);
226+
let p = key_samples.len() % (len - used_len);
197227
key_sample_sizes.push(used_len);
198-
key_samples.extend_from_slice(&temp[j..j + used_len]);
228+
key_samples.extend_from_slice(&temp[p..p + used_len]);
199229
}
200230
}
201-
if key_remaining == 0 && value_remaining == 0 {
202-
break;
203-
}
204-
i = (i + 12345678) % entries.len();
205231
}
206232
assert!(key_samples.len() == key_sample_sizes.iter().sum::<usize>());
207-
assert!(value_samples.len() == value_sample_sizes.iter().sum::<usize>());
208233
if key_samples.len() > MIN_KEY_COMPRESSION_SAMPLES_SIZE && key_sample_sizes.len() > 5 {
209234
self.key_compression_dictionary = zstd::dict::from_continuous(
210235
&key_samples,
@@ -213,15 +238,6 @@ impl<'a> StaticSortedFileBuilder<'a> {
213238
)
214239
.context("Key dictionary creation failed")?;
215240
}
216-
if value_samples.len() > MIN_VALUE_COMPRESSION_SAMPLES_SIZE && value_sample_sizes.len() > 5
217-
{
218-
self.value_compression_dictionary = zstd::dict::from_continuous(
219-
&value_samples,
220-
&value_sample_sizes,
221-
VALUE_COMPRESSION_DICTIONARY_SIZE,
222-
)
223-
.context("Value dictionary creation failed")?;
224-
}
225241
Ok(())
226242
}
227243

0 commit comments

Comments
 (0)