diff --git a/cpp/src/arrow/array/array_dict.h b/cpp/src/arrow/array/array_dict.h
index c87606f7caf..ce1f49ce5fa 100644
--- a/cpp/src/arrow/array/array_dict.h
+++ b/cpp/src/arrow/array/array_dict.h
@@ -75,6 +75,12 @@ class ARROW_EXPORT DictionaryArray : public Array {
       const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices,
       const std::shared_ptr<Array>& dictionary);
 
+  static Result<std::shared_ptr<Array>> FromArrays(
+      const std::shared_ptr<Array>& indices, const std::shared_ptr<Array>& dictionary) {
+    return FromArrays(::arrow::dictionary(indices->type(), dictionary->type()), indices,
+                      dictionary);
+  }
+
   /// \brief Transpose this DictionaryArray
   ///
   /// This method constructs a new dictionary array with the given dictionary
diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 5b886e3e17e..da79a7d4baa 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -2689,6 +2689,39 @@ TEST(TestArrowReadWrite, DictionaryColumnChunkedWrite) {
   ::arrow::AssertTablesEqual(*expected_table, *result, false);
 }
 
+TEST(TestArrowReadWrite, NonUniqueDictionaryValues) {
+  // ARROW-10237
+  auto dict_with_dupes = ArrayFromJSON(::arrow::utf8(), R"(["a", "a", "b"])");
+  // test with all valid 4-long `indices`
+  for (int i = 0; i < 4 * 4 * 4 * 4; ++i) {
+    int j = i;
+    ASSERT_OK_AND_ASSIGN(
+        auto indices,
+        ArrayFromBuilderVisitor(::arrow::int32(), 4, [&](::arrow::Int32Builder* b) {
+          if (j % 4 < dict_with_dupes->length()) {
+            b->UnsafeAppend(j % 4);
+          } else {
+            b->UnsafeAppendNull();
+          }
+          j /= 4;
+        }));
+    ASSERT_OK_AND_ASSIGN(auto plain, ::arrow::compute::Take(*dict_with_dupes, *indices));
+    ASSERT_OK_AND_ASSIGN(auto encoded,
+                         ::arrow::DictionaryArray::FromArrays(indices, dict_with_dupes));
+
+    auto table = Table::Make(::arrow::schema({::arrow::field("d", encoded->type())}),
+                             ::arrow::ArrayVector{encoded});
+
+    ASSERT_OK(table->ValidateFull());
+
+    std::shared_ptr<Table> round_tripped;
+    ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip(table, true, 20, {}, &round_tripped));
+
+    ASSERT_OK(round_tripped->ValidateFull());
+    ::arrow::AssertArraysEqual(*plain, *round_tripped->column(0)->chunk(0), true);
+  }
+}
+
 TEST(TestArrowWrite, CheckChunkSize) {
   const int num_columns = 2;
   const int num_rows = 128;
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 4be8bb67361..eb3942f29c1 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -1393,6 +1393,14 @@ Status TypedColumnWriterImpl<DType>::WriteArrowDictionary(
   // It's a new dictionary. Call PutDictionary and keep track of it
   PARQUET_CATCH_NOT_OK(dict_encoder->PutDictionary(*dictionary));
 
+  // If there were duplicate value in the dictionary, the encoder's memo table
+  // will be out of sync with the indices in the Arrow array.
+  // The easiest solution for this uncommon case is to fallback to plain encoding.
+  if (dict_encoder->num_entries() != dictionary->length()) {
+    PARQUET_CATCH_NOT_OK(FallbackToPlainEncoding());
+    return WriteDense();
+  }
+
   // TODO(wesm): If some dictionary values are unobserved, then the
   // statistics will be inaccurate. Do we care enough to fix it?
   if (page_statistics_ != nullptr) {