From 596fde80769e8b38c7618c92bcc24aa174db8712 Mon Sep 17 00:00:00 2001 From: ZhangHuiGui Date: Sat, 25 May 2024 14:25:36 +0800 Subject: [PATCH] Support internal setting sorted-mode --- cpp/src/arrow/compute/row/compare_test.cc | 49 +++++---- cpp/src/arrow/compute/row/encode_internal.cc | 5 +- cpp/src/arrow/compute/row/encode_internal.h | 2 +- cpp/src/arrow/compute/row/grouper.cc | 3 +- cpp/src/arrow/compute/row/row_internal.cc | 101 ++++++++++--------- cpp/src/arrow/compute/row/row_internal.h | 3 +- 6 files changed, 81 insertions(+), 82 deletions(-) diff --git a/cpp/src/arrow/compute/row/compare_test.cc b/cpp/src/arrow/compute/row/compare_test.cc index 7c6f142f10a42..febf2657ee8aa 100644 --- a/cpp/src/arrow/compute/row/compare_test.cc +++ b/cpp/src/arrow/compute/row/compare_test.cc @@ -167,50 +167,43 @@ TEST(KeyCompare, CompareColumnsToRowsTempStackUsage) { TEST(KeyCompare, CompareColumnsWithEncodingOrder) { const int num_rows = 5; - for (auto are_cols_sorted : {true, false}) { - SCOPED_TRACE("are_cols_sorted = " + std::to_string(are_cols_sorted)); + const auto i32_col = ArrayFromJSON(int32(), "[0, 1, 2, 3, 4]"); + const auto i64_col = ArrayFromJSON(int64(), "[7, 8, 9, 10, 11]"); + + std::vector batches = {ExecBatch({i32_col, i64_col}, num_rows), + ExecBatch({i64_col, i32_col}, num_rows)}; + int batch_idx = 0; + for (const auto& batch : batches) { + SCOPED_TRACE("batch idx = " + std::to_string(batch_idx)); MemoryPool* pool = default_memory_pool(); TempVectorStack stack; ASSERT_OK(stack.Init(pool, KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows))); - auto i32_col = ArrayFromJSON(int32(), "[0, 1, 2, 3, 4]"); - auto i64_col = ArrayFromJSON(int64(), "[7, 8, 9, 10, 11]"); - - // resorted order in RowTableMetadata will be : i64_col, i32_col - ExecBatch batch_right({i32_col, i64_col}, num_rows); - std::vector r_col_metas; - ASSERT_OK(ColumnMetadatasFromExecBatch(batch_right, &r_col_metas)); + ASSERT_OK(ColumnMetadatasFromExecBatch(batch, &r_col_metas)); RowTableMetadata r_table_meta; - r_table_meta.FromColumnMetadataVector(r_col_metas, sizeof(uint64_t), sizeof(uint64_t), - are_cols_sorted); + r_table_meta.FromColumnMetadataVector(r_col_metas, sizeof(uint64_t), + sizeof(uint64_t)); std::vector r_column_arrays; - ASSERT_OK(ColumnArraysFromExecBatch(batch_right, &r_column_arrays)); + ASSERT_OK(ColumnArraysFromExecBatch(batch, &r_column_arrays)); RowTableImpl row_table; ASSERT_OK(row_table.Init(pool, r_table_meta)); RowTableEncoder row_encoder; - row_encoder.Init(r_col_metas, sizeof(uint64_t), sizeof(uint64_t), are_cols_sorted); + row_encoder.Init(r_col_metas, sizeof(uint64_t), sizeof(uint64_t)); row_encoder.PrepareEncodeSelected(0, num_rows, r_column_arrays); std::vector r_row_ids(num_rows); std::iota(r_row_ids.begin(), r_row_ids.end(), 0); ASSERT_OK(row_encoder.EncodeSelected(&row_table, num_rows, r_row_ids.data())); - ExecBatch batch_left; - if (are_cols_sorted) { - batch_left.values = {i64_col, i32_col}; - } else { - batch_left.values = {i32_col, i64_col}; - } - batch_left.length = num_rows; - std::vector l_column_arrays; - ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &l_column_arrays)); + // Input left batch should always be 'i64_col,i32_col' order. + ASSERT_OK(ColumnArraysFromExecBatch(batches[1], &l_column_arrays)); std::vector l_row_ids(num_rows); std::iota(l_row_ids.begin(), l_row_ids.end(), 0); @@ -221,10 +214,16 @@ TEST(KeyCompare, CompareColumnsWithEncodingOrder) { std::vector row_ids_out(num_rows); KeyCompare::CompareColumnsToRows( num_rows, NULLPTR, l_row_ids.data(), &ctx, &num_rows_no_match, row_ids_out.data(), - l_column_arrays, row_table, are_cols_sorted, NULLPTR); - // Because the data of batch_left and batch_right are the same, their comparison - // results should be the same regardless of whether are_cols_sorted is true or false. + l_column_arrays, row_table, r_table_meta.are_cols_sorted, NULLPTR); + // The data of these two batches are the same, their comparison results + // should be the same regardless of whether are_cols_sorted is true or false. ASSERT_EQ(num_rows_no_match, 0); + if (batch_idx == 0) { + ASSERT_EQ(row_encoder.row_metadata().are_cols_sorted, true); + } else if (batch_idx == 1) { + ASSERT_EQ(row_encoder.row_metadata().are_cols_sorted, false); + } + batch_idx++; } } diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc index 13a74bc790361..01d552ef8270f 100644 --- a/cpp/src/arrow/compute/row/encode_internal.cc +++ b/cpp/src/arrow/compute/row/encode_internal.cc @@ -22,9 +22,8 @@ namespace arrow { namespace compute { void RowTableEncoder::Init(const std::vector& cols, int row_alignment, - int string_alignment, bool are_columns_sorted) { - row_metadata_.FromColumnMetadataVector(cols, row_alignment, string_alignment, - are_columns_sorted); + int string_alignment) { + row_metadata_.FromColumnMetadataVector(cols, row_alignment, string_alignment); uint32_t num_cols = row_metadata_.num_cols(); uint32_t num_varbinary_cols = row_metadata_.num_varbinary_cols(); batch_all_cols_.resize(num_cols); diff --git a/cpp/src/arrow/compute/row/encode_internal.h b/cpp/src/arrow/compute/row/encode_internal.h index 256a44ba0824b..2afc150530b9e 100644 --- a/cpp/src/arrow/compute/row/encode_internal.h +++ b/cpp/src/arrow/compute/row/encode_internal.h @@ -47,7 +47,7 @@ namespace compute { class ARROW_EXPORT RowTableEncoder { public: void Init(const std::vector& cols, int row_alignment, - int string_alignment, bool are_columns_sorted = true); + int string_alignment); const RowTableMetadata& row_metadata() { return row_metadata_; } // GrouperFastImpl right now needs somewhat intrusive visibility into RowTableEncoder diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc index fd026599cfd73..930300aba5964 100644 --- a/cpp/src/arrow/compute/row/grouper.cc +++ b/cpp/src/arrow/compute/row/grouper.cc @@ -569,8 +569,7 @@ struct GrouperFastImpl : public Grouper { impl->encoder_.Init(impl->col_metadata_, /* row_alignment = */ sizeof(uint64_t), - /* string_alignment = */ sizeof(uint64_t), - /* are_columns_sorted = */ true); + /* string_alignment = */ sizeof(uint64_t)); RETURN_NOT_OK(impl->rows_.Init(ctx->memory_pool(), impl->encoder_.row_metadata())); RETURN_NOT_OK( impl->rows_minibatch_.Init(ctx->memory_pool(), impl->encoder_.row_metadata())); diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc index 47302ab9ccb54..503ba5eb834b1 100644 --- a/cpp/src/arrow/compute/row/row_internal.cc +++ b/cpp/src/arrow/compute/row/row_internal.cc @@ -54,7 +54,7 @@ bool RowTableMetadata::is_compatible(const RowTableMetadata& other) const { void RowTableMetadata::FromColumnMetadataVector( const std::vector& cols, int in_row_alignment, - int in_string_alignment, bool in_are_cols_sorted) { + int in_string_alignment) { column_metadatas.resize(cols.size()); for (size_t i = 0; i < cols.size(); ++i) { column_metadatas[i] = cols[i]; @@ -62,64 +62,67 @@ void RowTableMetadata::FromColumnMetadataVector( const auto num_cols = static_cast(cols.size()); + // Sort columns. + // + // Columns are sorted based on the size in bytes of their fixed-length part. + // For the varying-length column, the fixed-length part is the 32-bit field storing + // cumulative length of varying-length fields. This is to make the memory access of + // each individual column within the encoded row alignment-friendly. + // + // The rules are: + // + // a) Boolean column, marked with fixed-length 0, is considered to have fixed-length + // part of 1 byte. + // + // b) Columns with fixed-length part being power of 2 or multiple of row + // alignment precede other columns. They are sorted in decreasing order of the size of + // their fixed-length part. + // + // c) Fixed-length columns precede varying-length columns when + // both have the same size fixed-length part. + // column_order.resize(num_cols); for (uint32_t i = 0; i < num_cols; ++i) { column_order[i] = i; } - - if (in_are_cols_sorted) { - // Sort columns. - // - // Columns are sorted based on the size in bytes of their fixed-length part. - // For the varying-length column, the fixed-length part is the 32-bit field storing - // cumulative length of varying-length fields. This is to make the memory access of - // each individual column within the encoded row alignment-friendly. - // - // The rules are: - // - // a) Boolean column, marked with fixed-length 0, is considered to have fixed-length - // part of 1 byte. - // - // b) Columns with fixed-length part being power of 2 or multiple of row - // alignment precede other columns. They are sorted in decreasing order of the size of - // their fixed-length part. - // - // c) Fixed-length columns precede varying-length columns when - // both have the same size fixed-length part. - // - std::sort( - column_order.begin(), column_order.end(), [&cols](uint32_t left, uint32_t right) { - bool is_left_pow2 = !cols[left].is_fixed_length || - ARROW_POPCOUNT64(cols[left].fixed_length) <= 1; - bool is_right_pow2 = !cols[right].is_fixed_length || - ARROW_POPCOUNT64(cols[right].fixed_length) <= 1; - bool is_left_fixedlen = cols[left].is_fixed_length; - bool is_right_fixedlen = cols[right].is_fixed_length; - uint32_t width_left = - cols[left].is_fixed_length ? cols[left].fixed_length : sizeof(uint32_t); - uint32_t width_right = - cols[right].is_fixed_length ? cols[right].fixed_length : sizeof(uint32_t); - if (is_left_pow2 != is_right_pow2) { - return is_left_pow2; - } - if (!is_left_pow2) { - return left < right; - } - if (width_left != width_right) { - return width_left > width_right; - } - if (is_left_fixedlen != is_right_fixedlen) { - return is_left_fixedlen; - } + std::sort( + column_order.begin(), column_order.end(), [&cols](uint32_t left, uint32_t right) { + bool is_left_pow2 = + !cols[left].is_fixed_length || ARROW_POPCOUNT64(cols[left].fixed_length) <= 1; + bool is_right_pow2 = !cols[right].is_fixed_length || + ARROW_POPCOUNT64(cols[right].fixed_length) <= 1; + bool is_left_fixedlen = cols[left].is_fixed_length; + bool is_right_fixedlen = cols[right].is_fixed_length; + uint32_t width_left = + cols[left].is_fixed_length ? cols[left].fixed_length : sizeof(uint32_t); + uint32_t width_right = + cols[right].is_fixed_length ? cols[right].fixed_length : sizeof(uint32_t); + if (is_left_pow2 != is_right_pow2) { + return is_left_pow2; + } + if (!is_left_pow2) { return left < right; - }); - } + } + if (width_left != width_right) { + return width_left > width_right; + } + if (is_left_fixedlen != is_right_fixedlen) { + return is_left_fixedlen; + } + return left < right; + }); + are_cols_sorted = false; inverse_column_order.resize(num_cols); for (uint32_t i = 0; i < num_cols; ++i) { inverse_column_order[column_order[i]] = i; + // Check whether the column_order has changed due to sorting, + // and the sorted column order will be used first for better + // performance in grouper's compare. + if (inverse_column_order[i] != column_order[i] && are_cols_sorted == false) { + are_cols_sorted = true; + } } - are_cols_sorted = in_are_cols_sorted; row_alignment = in_row_alignment; string_alignment = in_string_alignment; varbinary_end_array_offset = 0; diff --git a/cpp/src/arrow/compute/row/row_internal.h b/cpp/src/arrow/compute/row/row_internal.h index fcc3b5944aa7b..dfd80f8d4dd44 100644 --- a/cpp/src/arrow/compute/row/row_internal.h +++ b/cpp/src/arrow/compute/row/row_internal.h @@ -149,8 +149,7 @@ struct ARROW_EXPORT RowTableMetadata { /// \brief Populate this instance to describe `cols` with the given alignment void FromColumnMetadataVector(const std::vector& cols, - int in_row_alignment, int in_string_alignment, - bool in_are_cols_sorted = true); + int in_row_alignment, int in_string_alignment); /// \brief True if `other` has the same number of columns /// and each column has the same width (two variable length