From a7e65f4a7570910aa9ce24ac17824ae3c16c1918 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Thu, 8 Aug 2024 09:48:34 -0700 Subject: [PATCH] fix: v2 writer used the wrong value to calculate how many rows were available in array (#2704) --- .../src/encodings/logical/primitive.rs | 2 +- .../src/encodings/physical/binary.rs | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/rust/lance-encoding/src/encodings/logical/primitive.rs b/rust/lance-encoding/src/encodings/logical/primitive.rs index 60df568ad7..b29cba6559 100644 --- a/rust/lance-encoding/src/encodings/logical/primitive.rs +++ b/rust/lance-encoding/src/encodings/logical/primitive.rs @@ -454,7 +454,7 @@ impl PrimitiveFieldEncoder { let mut offset = 0; let part_size = bit_util::ceil(array.len(), num_parts); for _ in 0..num_parts { - let avail = array.len() - part_size; + let avail = array.len() - offset; let chunk_size = avail.min(part_size); let part = array.slice(offset, chunk_size); let task = self.create_encode_task(vec![part])?; diff --git a/rust/lance-encoding/src/encodings/physical/binary.rs b/rust/lance-encoding/src/encodings/physical/binary.rs index d2a4437cdd..1fd73b9932 100644 --- a/rust/lance-encoding/src/encodings/physical/binary.rs +++ b/rust/lance-encoding/src/encodings/physical/binary.rs @@ -604,7 +604,7 @@ pub mod tests { } #[test_log::test(tokio::test)] - async fn test_few_rows_bigger_than_max_page_size() { + async fn test_bigger_than_max_page_size() { // Create an array with one single 32MiB string let big_string = String::from_iter((0..(32 * 1024 * 1024)).map(|_| '0')); let string_array = StringArray::from(vec![ @@ -624,6 +624,19 @@ pub mod tests { HashMap::new(), ) .await; + + // This is a regression testing the case where a page with X rows is split into Y parts + // where the number of parts is not evenly divisible by the number of rows. In this + // case we are splitting 90 rows into 4 parts. + let big_string = String::from_iter((0..(1000 * 1000)).map(|_| '0')); + let string_array = StringArray::from_iter_values((0..90).map(|_| big_string.clone())); + + check_round_trip_encoding_of_data( + vec![Arc::new(string_array)], + &TestCases::default(), + HashMap::new(), + ) + .await; } #[test_log::test(tokio::test)]