diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc index 6d7e1ebb..e0ff7aab 100644 --- a/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/src/parquet/arrow/arrow-reader-writer-test.cc @@ -536,6 +536,13 @@ class TestParquetIO : public ::testing::Test { *out = MakeSimpleTable(lists->Slice(3, size - 6), nullable_lists); } + // Prepare table of empty lists, with null values array (ARROW-2744) + void PrepareEmptyListsTable(int64_t size, std::shared_ptr* out) { + std::shared_ptr lists; + ASSERT_OK(MakeEmptyListsArray(size, &lists)); + *out = MakeSimpleTable(lists, true /* nullable_lists */); + } + void PrepareListOfListTable(int64_t size, bool nullable_parent_lists, bool nullable_lists, bool nullable_elements, int64_t null_count, std::shared_ptr
* out) { @@ -713,6 +720,12 @@ TYPED_TEST(TestParquetIO, SingleColumnTableOptionalReadWrite) { ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table)); } +TYPED_TEST(TestParquetIO, SingleEmptyListsColumnReadWrite) { + std::shared_ptr
table; + ASSERT_NO_FATAL_FAILURE(this->PrepareEmptyListsTable(SMALL_SIZE, &table)); + ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table)); +} + TYPED_TEST(TestParquetIO, SingleNullableListNullableColumnReadWrite) { std::shared_ptr
table; ASSERT_NO_FATAL_FAILURE(this->PrepareListTable(SMALL_SIZE, true, true, 10, &table)); @@ -1524,8 +1537,6 @@ void MakeDoubleTable(int num_columns, int num_rows, int nchunks, void MakeListArray(int num_rows, std::shared_ptr<::DataType>* out_type, std::shared_ptr* out_array) { - ::arrow::Int32Builder offset_builder; - std::vector length_draws; randint(num_rows, 0, 100, &length_draws); diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h index 7264324d..c70e0eff 100644 --- a/src/parquet/arrow/test-util.h +++ b/src/parquet/arrow/test-util.h @@ -394,6 +394,33 @@ Status MakeListArray(const std::shared_ptr& values, int64_t size, return Status::OK(); } +// Make an array containing only empty lists, with a null values array +Status MakeEmptyListsArray(int64_t size, std::shared_ptr* out_array) { + // Allocate an offsets buffer containing only zeroes + std::shared_ptr offsets_buffer; + const int64_t offsets_nbytes = (size + 1) * sizeof(int32_t); + RETURN_NOT_OK(::arrow::AllocateBuffer(::arrow::default_memory_pool(), offsets_nbytes, + &offsets_buffer)); + memset(offsets_buffer->mutable_data(), 0, offsets_nbytes); + + auto value_field = ::arrow::field("item", ::arrow::float64(), + false /* nullable_values */); + auto list_type = ::arrow::list(value_field); + + std::vector> child_buffers = {nullptr /* null bitmap */, + nullptr /* values */ }; + auto child_data = ::arrow::ArrayData::Make(value_field->type(), 0, + std::move(child_buffers)); + + std::vector> buffers = {nullptr /* bitmap */, + offsets_buffer }; + auto array_data = ::arrow::ArrayData::Make(list_type, size, std::move(buffers)); + array_data->child_data.push_back(child_data); + + *out_array = ::arrow::MakeArray(array_data); + return Status::OK(); +} + static std::shared_ptr<::arrow::Column> MakeColumn(const std::string& name, const std::shared_ptr& array, bool nullable) { diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc index 50b4649e..f7727381 100644 --- a/src/parquet/arrow/writer.cc +++ b/src/parquet/arrow/writer.cc @@ -411,8 +411,13 @@ Status ArrowColumnWriter::TypedWriteBatch(const Array& array, int64_t num_levels using ArrowCType = typename ArrowType::c_type; const auto& data = static_cast(array); - auto values = - reinterpret_cast(data.values()->data()) + data.offset(); + const ArrowCType* values = nullptr; + // The values buffer may be null if the array is empty (ARROW-2744) + if (data.values() != nullptr) { + values = reinterpret_cast(data.values()->data()) + data.offset(); + } else { + DCHECK_EQ(data.length(), 0); + } if (writer_->descr()->schema_node()->is_required() || (data.null_count() == 0)) { // no nulls, just dump the data @@ -706,7 +711,13 @@ Status ArrowColumnWriter::TypedWriteBatch( RETURN_NOT_OK(ctx_->GetScratchData(array.length(), &buffer)); const auto& data = static_cast(array); - auto values = reinterpret_cast(data.values()->data()); + const uint8_t* values = nullptr; + // The values buffer may be null if the array is empty (ARROW-2744) + if (data.values() != nullptr) { + values = reinterpret_cast(data.values()->data()); + } else { + DCHECK_EQ(data.length(), 0); + } int buffer_idx = 0; int64_t offset = array.offset();