Skip to content

Commit

Permalink
Merge pull request #134 from r-lib/fix/dictionary-offset-na
Browse files Browse the repository at this point in the history
  • Loading branch information
gaborcsardi authored Feb 22, 2025
2 parents de7c6d8 + 2e3ee31 commit d3e6d73
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 3 deletions.
22 changes: 19 additions & 3 deletions src/lib/ParquetReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,6 @@ void ParquetReader::read_column_chunk_int(ColumnChunk &cc) {
int64_t chunk_start =
cc.has_dictionary ? dictionary_page_offset : data_page_offset;

// Give a chance to R to allocate memory for the column chunk
alloc_column_chunk(cc);

// read in the whole chunk
BufferGuard tmp_buf_g = bufman_cc->claim();
ByteBuffer &tmp_buf = tmp_buf_g.buf;
Expand All @@ -239,6 +236,25 @@ void ParquetReader::read_column_chunk_int(ColumnChunk &cc) {
uint8_t *ptr = (uint8_t*) tmp_buf.ptr;
uint8_t *end = ptr + cmd.total_compressed_size;

// Polars does not set dictionary_page_offset :(
// https://github.com/r-lib/nanoparquet/issues/132
// We need to do this fix before calling alloc_column_chunk(), so the
// callback correctly knows if this chunk has a dictionary page.
// Sadly, this means that we are parsing the header of the first data
// page twice, for files that adhere to the spec and don't have dict
// pages. :((
if (!cc.has_dictionary) {
PageHeader dph;
uint32_t ph_size = cmd.total_compressed_size;
thrift_unpack(ptr, &ph_size, &dph, filename_);
if (dph.type == parquet::PageType::DICTIONARY_PAGE) {
cc.has_dictionary = true;
}
}

// Give a chance to R to allocate memory for the column chunk
alloc_column_chunk(cc);

// dictionary page, if any
if (cc.has_dictionary) {
PageHeader dph;
Expand Down
Binary file not shown.
9 changes: 9 additions & 0 deletions tests/testthat/test-read-parquet-5.R
Original file line number Diff line number Diff line change
Expand Up @@ -226,3 +226,12 @@ test_that("mixing RLE_DICTIONARY and PLAIN, FLOAT16", {
bs2[is.na(t1[,2])] <- NA
expect_equal(t1[,2], bs2)
})

# https://github.com/r-lib/nanoparquet/issues/132
test_that("dict page w/o dict offset set", {
pf <- test_path("data/broken/polars-no-dict-offset.parquet")
expect_equal(
as.data.frame(read_parquet(pf)),
data.frame(a = c(1,2,3), b = c(4,5,6))
)
})

0 comments on commit d3e6d73

Please sign in to comment.