From d5247d0108383161ac3cba2316d0ca57515cf6ab Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Wed, 7 Aug 2024 07:59:34 -0700 Subject: [PATCH] fix: add fix for slicing single array into many pages but don't have enough rows to do so (#2702) --- .../src/encodings/logical/primitive.rs | 2 ++ .../src/encodings/physical/binary.rs | 23 +++++++++++++++++++ rust/lance-encoding/src/testing.rs | 13 ++++++++++- 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/rust/lance-encoding/src/encodings/logical/primitive.rs b/rust/lance-encoding/src/encodings/logical/primitive.rs index 7439dd5f4f..60df568ad7 100644 --- a/rust/lance-encoding/src/encodings/logical/primitive.rs +++ b/rust/lance-encoding/src/encodings/logical/primitive.rs @@ -440,6 +440,8 @@ impl PrimitiveFieldEncoder { let array = arrays.into_iter().next().unwrap(); let size_bytes = array.get_buffer_memory_size(); let num_parts = bit_util::ceil(size_bytes, self.max_page_bytes as usize); + // Can't slice it finer than 1 page per row + let num_parts = num_parts.min(array.len()); if num_parts <= 1 { // One part and it fits in a page Ok(vec![self.create_encode_task(vec![array])?]) diff --git a/rust/lance-encoding/src/encodings/physical/binary.rs b/rust/lance-encoding/src/encodings/physical/binary.rs index e2350b2474..d2a4437cdd 100644 --- a/rust/lance-encoding/src/encodings/physical/binary.rs +++ b/rust/lance-encoding/src/encodings/physical/binary.rs @@ -603,6 +603,29 @@ pub mod tests { .await; } + #[test_log::test(tokio::test)] + async fn test_few_rows_bigger_than_max_page_size() { + // Create an array with one single 32MiB string + let big_string = String::from_iter((0..(32 * 1024 * 1024)).map(|_| '0')); + let string_array = StringArray::from(vec![ + Some(big_string), + Some("abc".to_string()), + None, + None, + Some("xyz".to_string()), + ]); + + // Drop the max page size to 1MiB + let test_cases = TestCases::default().with_max_page_size(1024 * 1024); + + check_round_trip_encoding_of_data( + vec![Arc::new(string_array)], + &test_cases, + HashMap::new(), + ) + .await; + } + #[test_log::test(tokio::test)] async fn test_empty_strings() { // Scenario 1: Some strings are empty diff --git a/rust/lance-encoding/src/testing.rs b/rust/lance-encoding/src/testing.rs index 13ff775dad..0c5f905ca0 100644 --- a/rust/lance-encoding/src/testing.rs +++ b/rust/lance-encoding/src/testing.rs @@ -200,6 +200,7 @@ pub struct TestCases { indices: Vec>, batch_size: u32, skip_validation: bool, + max_page_size: Option, } impl Default for TestCases { @@ -209,6 +210,7 @@ impl Default for TestCases { ranges: Vec::new(), indices: Vec::new(), skip_validation: false, + max_page_size: None, } } } @@ -233,6 +235,15 @@ impl TestCases { self.skip_validation = true; self } + + pub fn with_max_page_size(mut self, max_page_size: u64) -> Self { + self.max_page_size = Some(max_page_size); + self + } + + fn get_max_page_size(&self) -> u64 { + self.max_page_size.unwrap_or(MAX_PAGE_BYTES) + } } /// Given specific data and test cases we check round trip encoding and decoding @@ -255,7 +266,7 @@ pub async fn check_round_trip_encoding_of_data( let mut column_index_seq = ColumnIndexSequence::default(); let encoding_options = EncodingOptions { cache_bytes_per_column: page_size, - max_page_bytes: MAX_PAGE_BYTES, + max_page_bytes: test_cases.get_max_page_size(), keep_original_array: true, }; let encoder = encoding_strategy