Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cpp/src/arrow/array/array_dict.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,12 @@ class ARROW_EXPORT DictionaryArray : public Array {
const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices,
const std::shared_ptr<Array>& dictionary);

static Result<std::shared_ptr<Array>> FromArrays(
const std::shared_ptr<Array>& indices, const std::shared_ptr<Array>& dictionary) {
return FromArrays(::arrow::dictionary(indices->type(), dictionary->type()), indices,
dictionary);
}

/// \brief Transpose this DictionaryArray
///
/// This method constructs a new dictionary array with the given dictionary
Expand Down
33 changes: 33 additions & 0 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2689,6 +2689,39 @@ TEST(TestArrowReadWrite, DictionaryColumnChunkedWrite) {
::arrow::AssertTablesEqual(*expected_table, *result, false);
}

TEST(TestArrowReadWrite, NonUniqueDictionaryValues) {
// ARROW-10237
auto dict_with_dupes = ArrayFromJSON(::arrow::utf8(), R"(["a", "a", "b"])");
// test with all valid 4-long `indices`
for (int i = 0; i < 4 * 4 * 4 * 4; ++i) {
int j = i;
ASSERT_OK_AND_ASSIGN(
auto indices,
ArrayFromBuilderVisitor(::arrow::int32(), 4, [&](::arrow::Int32Builder* b) {
if (j % 4 < dict_with_dupes->length()) {
b->UnsafeAppend(j % 4);
} else {
b->UnsafeAppendNull();
}
j /= 4;
}));
ASSERT_OK_AND_ASSIGN(auto plain, ::arrow::compute::Take(*dict_with_dupes, *indices));
ASSERT_OK_AND_ASSIGN(auto encoded,
::arrow::DictionaryArray::FromArrays(indices, dict_with_dupes));

auto table = Table::Make(::arrow::schema({::arrow::field("d", encoded->type())}),
::arrow::ArrayVector{encoded});

ASSERT_OK(table->ValidateFull());

std::shared_ptr<Table> round_tripped;
ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip(table, true, 20, {}, &round_tripped));

ASSERT_OK(round_tripped->ValidateFull());
::arrow::AssertArraysEqual(*plain, *round_tripped->column(0)->chunk(0), true);
}
}

TEST(TestArrowWrite, CheckChunkSize) {
const int num_columns = 2;
const int num_rows = 128;
Expand Down
8 changes: 8 additions & 0 deletions cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1393,6 +1393,14 @@ Status TypedColumnWriterImpl<DType>::WriteArrowDictionary(
// It's a new dictionary. Call PutDictionary and keep track of it
PARQUET_CATCH_NOT_OK(dict_encoder->PutDictionary(*dictionary));

// If there were duplicate value in the dictionary, the encoder's memo table
// will be out of sync with the indices in the Arrow array.
// The easiest solution for this uncommon case is to fallback to plain encoding.
if (dict_encoder->num_entries() != dictionary->length()) {
PARQUET_CATCH_NOT_OK(FallbackToPlainEncoding());
return WriteDense();
}

// TODO(wesm): If some dictionary values are unobserved, then the
// statistics will be inaccurate. Do we care enough to fix it?
if (page_statistics_ != nullptr) {
Expand Down