diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 9ad5a2d6e8d..d371ef5de93 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -337,7 +337,8 @@ int64_t find_next_split(int64_t cur_pos, size_t cur_row_index, size_t cur_cumulative_size, cudf::host_span sizes, - size_t size_limit) + size_t size_limit, + size_t min_row_count) { auto const start = thrust::make_transform_iterator( sizes.begin(), @@ -357,7 +358,7 @@ int64_t find_next_split(int64_t cur_pos, // this guarantees that even if we cannot fit the set of rows represented by our where our cur_pos // is, we will still move forward instead of failing. while (split_pos < (static_cast(sizes.size()) - 1) && - (sizes[split_pos].end_row_index == cur_row_index)) { + (sizes[split_pos].end_row_index - cur_row_index < min_row_count)) { split_pos++; } @@ -657,8 +658,10 @@ std::tuple, size_t, size_t> compute_next_subpass( auto const start_index = find_start_index(h_aggregated_info, start_row); auto const cumulative_size = start_row == 0 || start_index == 0 ? 0 : h_aggregated_info[start_index - 1].size_bytes; + // when choosing subpasses, we need to guarantee at least 2 rows in the included pages so that all + // list columns have a clear start and end. auto const end_index = - find_next_split(start_index, start_row, cumulative_size, h_aggregated_info, size_limit); + find_next_split(start_index, start_row, cumulative_size, h_aggregated_info, size_limit, 2); auto const end_row = h_aggregated_info[end_index].end_row_index; // for each column, collect the set of pages that spans start_row / end_row @@ -703,8 +706,8 @@ std::vector compute_page_splits_by_row(device_span 0; if (is_list && max_col_row < last_pass_row) { - size_t const min_col_row = static_cast(chunk.start_row + last_page.chunk_row); + auto const& first_page = subpass.pages[page_index]; + size_t const min_col_row = static_cast(chunk.start_row + first_page.chunk_row); CUDF_EXPECTS((max_col_row - min_col_row) > 1, "Unexpected short subpass"); max_col_row--; }