Skip to content

Commit

Permalink
fix: handle slicing a single array into many pages when there aren't …
Browse files Browse the repository at this point in the history
…enough rows to do so (#2702)
  • Loading branch information
westonpace authored Aug 7, 2024
1 parent a349b6d commit d5247d0
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 1 deletion.
2 changes: 2 additions & 0 deletions rust/lance-encoding/src/encodings/logical/primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,8 @@ impl PrimitiveFieldEncoder {
let array = arrays.into_iter().next().unwrap();
let size_bytes = array.get_buffer_memory_size();
let num_parts = bit_util::ceil(size_bytes, self.max_page_bytes as usize);
// Can't slice it finer than 1 page per row
let num_parts = num_parts.min(array.len());
if num_parts <= 1 {
// One part and it fits in a page
Ok(vec![self.create_encode_task(vec![array])?])
Expand Down
23 changes: 23 additions & 0 deletions rust/lance-encoding/src/encodings/physical/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,29 @@ pub mod tests {
.await;
}

#[test_log::test(tokio::test)]
async fn test_few_rows_bigger_than_max_page_size() {
    // One huge 32MiB string plus a handful of tiny / null rows.  With a
    // small page size limit the encoder wants far more pages than there
    // are rows, which must not panic.
    let huge_value = "0".repeat(32 * 1024 * 1024);
    let string_array = StringArray::from(vec![
        Some(huge_value),
        Some("abc".to_string()),
        None,
        None,
        Some("xyz".to_string()),
    ]);

    // Shrink the page size limit down to 1MiB
    let test_cases = TestCases::default().with_max_page_size(1024 * 1024);

    check_round_trip_encoding_of_data(
        vec![Arc::new(string_array)],
        &test_cases,
        HashMap::new(),
    )
    .await;
}

#[test_log::test(tokio::test)]
async fn test_empty_strings() {
// Scenario 1: Some strings are empty
Expand Down
13 changes: 12 additions & 1 deletion rust/lance-encoding/src/testing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ pub struct TestCases {
indices: Vec<Vec<u64>>,
batch_size: u32,
skip_validation: bool,
max_page_size: Option<u64>,
}

impl Default for TestCases {
Expand All @@ -209,6 +210,7 @@ impl Default for TestCases {
ranges: Vec::new(),
indices: Vec::new(),
skip_validation: false,
max_page_size: None,
}
}
}
Expand All @@ -233,6 +235,15 @@ impl TestCases {
self.skip_validation = true;
self
}

pub fn with_max_page_size(mut self, max_page_size: u64) -> Self {
self.max_page_size = Some(max_page_size);
self
}

/// The max page size to encode with: the configured override if one
/// was supplied, otherwise the global default.
fn get_max_page_size(&self) -> u64 {
    match self.max_page_size {
        Some(size) => size,
        None => MAX_PAGE_BYTES,
    }
}
}

/// Given specific data and test cases we check round trip encoding and decoding
Expand All @@ -255,7 +266,7 @@ pub async fn check_round_trip_encoding_of_data(
let mut column_index_seq = ColumnIndexSequence::default();
let encoding_options = EncodingOptions {
cache_bytes_per_column: page_size,
max_page_bytes: MAX_PAGE_BYTES,
max_page_bytes: test_cases.get_max_page_size(),
keep_original_array: true,
};
let encoder = encoding_strategy
Expand Down

0 comments on commit d5247d0

Please sign in to comment.