diff --git a/src/lib/ParquetReader.cpp b/src/lib/ParquetReader.cpp index f068354..ca251d1 100644 --- a/src/lib/ParquetReader.cpp +++ b/src/lib/ParquetReader.cpp @@ -227,9 +227,6 @@ void ParquetReader::read_column_chunk_int(ColumnChunk &cc) { int64_t chunk_start = cc.has_dictionary ? dictionary_page_offset : data_page_offset; - // Give a chance to R to allocate memory for the column chunk - alloc_column_chunk(cc); - // read in the whole chunk BufferGuard tmp_buf_g = bufman_cc->claim(); ByteBuffer &tmp_buf = tmp_buf_g.buf; @@ -239,6 +236,25 @@ void ParquetReader::read_column_chunk_int(ColumnChunk &cc) { uint8_t *ptr = (uint8_t*) tmp_buf.ptr; uint8_t *end = ptr + cmd.total_compressed_size; + // Polars does not set dictionary_page_offset, so cc.has_dictionary may + // be false even though the chunk starts with a dictionary page: + // https://github.com/r-lib/nanoparquet/issues/132 + // Peek at the first page header here, before alloc_column_chunk(), so the + // callback correctly knows if this chunk has a dictionary page. + // Sadly, this means that we are parsing the header of the first data + // page twice for files that adhere to the spec and have no dict page.
:(( + if (!cc.has_dictionary) { + PageHeader dph; + uint32_t ph_size = cmd.total_compressed_size; + thrift_unpack(ptr, &ph_size, &dph, filename_); + if (dph.type == parquet::PageType::DICTIONARY_PAGE) { + cc.has_dictionary = true; + } + } + + // Give a chance to R to allocate memory for the column chunk + alloc_column_chunk(cc); + // dictionary page, if any if (cc.has_dictionary) { PageHeader dph; diff --git a/tests/testthat/data/broken/polars-no-dict-offset.parquet b/tests/testthat/data/broken/polars-no-dict-offset.parquet new file mode 100644 index 0000000..a231b06 Binary files /dev/null and b/tests/testthat/data/broken/polars-no-dict-offset.parquet differ diff --git a/tests/testthat/test-read-parquet-5.R b/tests/testthat/test-read-parquet-5.R index 329290d..7aaf8f4 100644 --- a/tests/testthat/test-read-parquet-5.R +++ b/tests/testthat/test-read-parquet-5.R @@ -226,3 +226,12 @@ test_that("mixing RLE_DICTIONARY and PLAIN, FLOAT16", { bs2[is.na(t1[,2])] <- NA expect_equal(t1[,2], bs2) }) + +# https://github.com/r-lib/nanoparquet/issues/132 +test_that("dict page w/o dict offset set", { + pf <- test_path("data/broken/polars-no-dict-offset.parquet") + expect_equal( + as.data.frame(read_parquet(pf)), + data.frame(a = c(1,2,3), b = c(4,5,6)) + ) +})