11use std:: {
23 borrow:: Cow ,
8+ cmp:: { max , min } ,
410 fs:: File ,
511 io:: { self , BufWriter , Seek , Write } ,
612 path:: Path ,
@@ -45,6 +51,8 @@ const MIN_VALUE_COMPRESSION_SAMPLES_SIZE: usize = 1024;
4551const MIN_KEY_COMPRESSION_SAMPLES_SIZE : usize = 1024 ;
4652/// The bytes that are used per key/value entry for a sample.
4753const COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY : usize = 100 ;
54+ /// The minimum bytes that are used per key/value entry for a sample.
55+ const MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY : usize = 16 ;
4856
4957/// Trait for entries from which SST files can be created
5058pub trait Entry {
@@ -153,58 +161,75 @@ impl<'a> StaticSortedFileBuilder<'a> {
153161 {
154162 return Ok ( ( ) ) ;
155163 }
156- let key_compression_samples_size = min ( KEY_COMPRESSION_SAMPLES_SIZE , total_key_size / 10 ) ;
164+ let key_compression_samples_size = min ( KEY_COMPRESSION_SAMPLES_SIZE , total_key_size / 16 ) ;
157165 let value_compression_samples_size =
158- min ( VALUE_COMPRESSION_SAMPLES_SIZE , total_value_size / 10 ) ;
166+ min ( VALUE_COMPRESSION_SAMPLES_SIZE , total_value_size / 16 ) ;
159167 let mut value_samples = Vec :: with_capacity ( value_compression_samples_size) ;
160168 let mut value_sample_sizes = Vec :: new ( ) ;
161169 let mut key_samples = Vec :: with_capacity ( key_compression_samples_size) ;
162170 let mut key_sample_sizes = Vec :: new ( ) ;
163- let mut i = 12345678 % entries. len ( ) ;
164- let mut j = 0 ;
165- loop {
166- let entry = & entries[ i] ;
171+
172+ // Limit the number of iterations to avoid infinite loops
173+ let max_iterations =
174+ max ( total_key_size, total_value_size) / COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY * 2 ;
175+ for i in 0 ..max_iterations {
176+ let entry = & entries[ i % entries. len ( ) ] ;
167177 let value_remaining = value_compression_samples_size - value_samples. len ( ) ;
168- let key_remaining = key_compression_samples_size - key_samples. len ( ) ;
169- if value_remaining > 0
170- && let EntryValue :: Small { value } | EntryValue :: Medium { value } = entry. value ( )
171- {
172- let value = if value. len ( ) <= COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
173- value
174- } else {
175- j = ( j + 12345678 ) % ( value. len ( ) - COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY ) ;
176- & value[ j..j + COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY ]
177- } ;
178- if value. len ( ) <= value_remaining {
179- value_sample_sizes. push ( value. len ( ) ) ;
180- value_samples. extend_from_slice ( value) ;
181- } else {
182- value_sample_sizes. push ( value_remaining) ;
183- value_samples. extend_from_slice ( & value[ ..value_remaining] ) ;
178+ if value_remaining < MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
179+ break ;
180+ }
181+ if let EntryValue :: Small { value } | EntryValue :: Medium { value } = entry. value ( ) {
182+ let len = value. len ( ) ;
183+ if len >= MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
184+ let used_len = min ( value_remaining, COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY ) ;
185+ if len <= used_len {
186+ value_sample_sizes. push ( len) ;
187+ value_samples. extend_from_slice ( value) ;
188+ } else {
189+ value_sample_sizes. push ( used_len) ;
190+ let p = value_samples. len ( ) % ( len - used_len) ;
191+ value_samples. extend_from_slice ( & value[ p..p + used_len] ) ;
192+ } ;
184193 }
185194 }
186- if key_remaining > 0 {
195+ }
196+ assert ! ( value_samples. len( ) == value_sample_sizes. iter( ) . sum:: <usize >( ) ) ;
197+ if value_samples. len ( ) > MIN_VALUE_COMPRESSION_SAMPLES_SIZE && value_sample_sizes. len ( ) > 5
198+ {
199+ self . value_compression_dictionary = zstd:: dict:: from_continuous (
200+ & value_samples,
201+ & value_sample_sizes,
202+ VALUE_COMPRESSION_DICTIONARY_SIZE ,
203+ )
204+ . context ( "Value dictionary creation failed" ) ?;
205+ } else {
206+ self . value_compression_dictionary = Vec :: new ( ) ;
207+ }
208+
209+ for i in 0 ..max_iterations {
210+ let entry = & entries[ i % entries. len ( ) ] ;
211+ let key_remaining = key_compression_samples_size - key_samples. len ( ) ;
212+ if key_remaining < MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
213+ break ;
214+ }
215+ let len = entry. key_len ( ) ;
216+ if len >= MIN_COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY {
187217 let used_len = min ( key_remaining, COMPRESSION_DICTIONARY_SAMPLE_PER_ENTRY ) ;
188- if entry . key_len ( ) <= used_len {
189- key_sample_sizes. push ( entry . key_len ( ) ) ;
218+ if len <= used_len {
219+ key_sample_sizes. push ( len ) ;
190220 entry. write_key_to ( & mut key_samples) ;
191221 } else {
192- let mut temp = Vec :: with_capacity ( entry . key_len ( ) ) ;
222+ let mut temp = Vec :: with_capacity ( len ) ;
193223 entry. write_key_to ( & mut temp) ;
194- debug_assert ! ( temp. len( ) == entry . key_len ( ) ) ;
224+ debug_assert ! ( temp. len( ) == len ) ;
195225
196- j = ( j + 12345678 ) % ( temp . len ( ) - used_len) ;
226+ let p = key_samples . len ( ) % ( len - used_len) ;
197227 key_sample_sizes. push ( used_len) ;
198- key_samples. extend_from_slice ( & temp[ j..j + used_len] ) ;
228+ key_samples. extend_from_slice ( & temp[ p..p + used_len] ) ;
199229 }
200230 }
201- if key_remaining == 0 && value_remaining == 0 {
202- break ;
203- }
204- i = ( i + 12345678 ) % entries. len ( ) ;
205231 }
206232 assert ! ( key_samples. len( ) == key_sample_sizes. iter( ) . sum:: <usize >( ) ) ;
207- assert ! ( value_samples. len( ) == value_sample_sizes. iter( ) . sum:: <usize >( ) ) ;
208233 if key_samples. len ( ) > MIN_KEY_COMPRESSION_SAMPLES_SIZE && key_sample_sizes. len ( ) > 5 {
209234 self . key_compression_dictionary = zstd:: dict:: from_continuous (
210235 & key_samples,
@@ -213,15 +238,6 @@ impl<'a> StaticSortedFileBuilder<'a> {
213238 )
214239 . context ( "Key dictionary creation failed" ) ?;
215240 }
216- if value_samples. len ( ) > MIN_VALUE_COMPRESSION_SAMPLES_SIZE && value_sample_sizes. len ( ) > 5
217- {
218- self . value_compression_dictionary = zstd:: dict:: from_continuous (
219- & value_samples,
220- & value_sample_sizes,
221- VALUE_COMPRESSION_DICTIONARY_SIZE ,
222- )
223- . context ( "Value dictionary creation failed" ) ?;
224- }
225241 Ok ( ( ) )
226242 }
227243
0 commit comments