Skip to content

Commit

Permalink
Support internal setting sorted-mode
Browse files Browse the repository at this point in the history
  • Loading branch information
ZhangHuiGui committed May 30, 2024
1 parent ff8fd7d commit 596fde8
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 82 deletions.
49 changes: 24 additions & 25 deletions cpp/src/arrow/compute/row/compare_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -167,50 +167,43 @@ TEST(KeyCompare, CompareColumnsToRowsTempStackUsage) {
TEST(KeyCompare, CompareColumnsWithEncodingOrder) {
const int num_rows = 5;

for (auto are_cols_sorted : {true, false}) {
SCOPED_TRACE("are_cols_sorted = " + std::to_string(are_cols_sorted));
const auto i32_col = ArrayFromJSON(int32(), "[0, 1, 2, 3, 4]");
const auto i64_col = ArrayFromJSON(int64(), "[7, 8, 9, 10, 11]");

std::vector<ExecBatch> batches = {ExecBatch({i32_col, i64_col}, num_rows),
ExecBatch({i64_col, i32_col}, num_rows)};
int batch_idx = 0;
for (const auto& batch : batches) {
SCOPED_TRACE("batch idx = " + std::to_string(batch_idx));

MemoryPool* pool = default_memory_pool();
TempVectorStack stack;
ASSERT_OK(stack.Init(pool, KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows)));

auto i32_col = ArrayFromJSON(int32(), "[0, 1, 2, 3, 4]");
auto i64_col = ArrayFromJSON(int64(), "[7, 8, 9, 10, 11]");

// resorted order in RowTableMetadata will be : i64_col, i32_col
ExecBatch batch_right({i32_col, i64_col}, num_rows);

std::vector<KeyColumnMetadata> r_col_metas;
ASSERT_OK(ColumnMetadatasFromExecBatch(batch_right, &r_col_metas));
ASSERT_OK(ColumnMetadatasFromExecBatch(batch, &r_col_metas));

RowTableMetadata r_table_meta;
r_table_meta.FromColumnMetadataVector(r_col_metas, sizeof(uint64_t), sizeof(uint64_t),
are_cols_sorted);
r_table_meta.FromColumnMetadataVector(r_col_metas, sizeof(uint64_t),
sizeof(uint64_t));

std::vector<KeyColumnArray> r_column_arrays;
ASSERT_OK(ColumnArraysFromExecBatch(batch_right, &r_column_arrays));
ASSERT_OK(ColumnArraysFromExecBatch(batch, &r_column_arrays));

RowTableImpl row_table;
ASSERT_OK(row_table.Init(pool, r_table_meta));

RowTableEncoder row_encoder;
row_encoder.Init(r_col_metas, sizeof(uint64_t), sizeof(uint64_t), are_cols_sorted);
row_encoder.Init(r_col_metas, sizeof(uint64_t), sizeof(uint64_t));
row_encoder.PrepareEncodeSelected(0, num_rows, r_column_arrays);

std::vector<uint16_t> r_row_ids(num_rows);
std::iota(r_row_ids.begin(), r_row_ids.end(), 0);
ASSERT_OK(row_encoder.EncodeSelected(&row_table, num_rows, r_row_ids.data()));

ExecBatch batch_left;
if (are_cols_sorted) {
batch_left.values = {i64_col, i32_col};
} else {
batch_left.values = {i32_col, i64_col};
}
batch_left.length = num_rows;

std::vector<KeyColumnArray> l_column_arrays;
ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &l_column_arrays));
// The left input batch must always be in 'i64_col, i32_col' order.
ASSERT_OK(ColumnArraysFromExecBatch(batches[1], &l_column_arrays));

std::vector<uint32_t> l_row_ids(num_rows);
std::iota(l_row_ids.begin(), l_row_ids.end(), 0);
Expand All @@ -221,10 +214,16 @@ TEST(KeyCompare, CompareColumnsWithEncodingOrder) {
std::vector<uint16_t> row_ids_out(num_rows);
KeyCompare::CompareColumnsToRows(
num_rows, NULLPTR, l_row_ids.data(), &ctx, &num_rows_no_match, row_ids_out.data(),
l_column_arrays, row_table, are_cols_sorted, NULLPTR);
// Because the data of batch_left and batch_right are the same, their comparison
// results should be the same regardless of whether are_cols_sorted is true or false.
l_column_arrays, row_table, r_table_meta.are_cols_sorted, NULLPTR);
// The data of these two batches is the same, so their comparison results
// should be the same regardless of whether are_cols_sorted is true or false.
ASSERT_EQ(num_rows_no_match, 0);
if (batch_idx == 0) {
ASSERT_EQ(row_encoder.row_metadata().are_cols_sorted, true);
} else if (batch_idx == 1) {
ASSERT_EQ(row_encoder.row_metadata().are_cols_sorted, false);
}
batch_idx++;
}
}

Expand Down
5 changes: 2 additions & 3 deletions cpp/src/arrow/compute/row/encode_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,8 @@ namespace arrow {
namespace compute {

void RowTableEncoder::Init(const std::vector<KeyColumnMetadata>& cols, int row_alignment,
int string_alignment, bool are_columns_sorted) {
row_metadata_.FromColumnMetadataVector(cols, row_alignment, string_alignment,
are_columns_sorted);
int string_alignment) {
row_metadata_.FromColumnMetadataVector(cols, row_alignment, string_alignment);
uint32_t num_cols = row_metadata_.num_cols();
uint32_t num_varbinary_cols = row_metadata_.num_varbinary_cols();
batch_all_cols_.resize(num_cols);
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/row/encode_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ namespace compute {
class ARROW_EXPORT RowTableEncoder {
public:
void Init(const std::vector<KeyColumnMetadata>& cols, int row_alignment,
int string_alignment, bool are_columns_sorted = true);
int string_alignment);

const RowTableMetadata& row_metadata() { return row_metadata_; }
// GrouperFastImpl right now needs somewhat intrusive visibility into RowTableEncoder
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/arrow/compute/row/grouper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -569,8 +569,7 @@ struct GrouperFastImpl : public Grouper {

impl->encoder_.Init(impl->col_metadata_,
/* row_alignment = */ sizeof(uint64_t),
/* string_alignment = */ sizeof(uint64_t),
/* are_columns_sorted = */ true);
/* string_alignment = */ sizeof(uint64_t));
RETURN_NOT_OK(impl->rows_.Init(ctx->memory_pool(), impl->encoder_.row_metadata()));
RETURN_NOT_OK(
impl->rows_minibatch_.Init(ctx->memory_pool(), impl->encoder_.row_metadata()));
Expand Down
101 changes: 52 additions & 49 deletions cpp/src/arrow/compute/row/row_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,72 +54,75 @@ bool RowTableMetadata::is_compatible(const RowTableMetadata& other) const {

void RowTableMetadata::FromColumnMetadataVector(
const std::vector<KeyColumnMetadata>& cols, int in_row_alignment,
int in_string_alignment, bool in_are_cols_sorted) {
int in_string_alignment) {
column_metadatas.resize(cols.size());
for (size_t i = 0; i < cols.size(); ++i) {
column_metadatas[i] = cols[i];
}

const auto num_cols = static_cast<uint32_t>(cols.size());

// Sort columns.
//
// Columns are sorted based on the size in bytes of their fixed-length part.
// For the varying-length column, the fixed-length part is the 32-bit field storing
// cumulative length of varying-length fields. This is to make the memory access of
// each individual column within the encoded row alignment-friendly.
//
// The rules are:
//
// a) Boolean column, marked with fixed-length 0, is considered to have fixed-length
// part of 1 byte.
//
// b) Columns with fixed-length part being power of 2 or multiple of row
// alignment precede other columns. They are sorted in decreasing order of the size of
// their fixed-length part.
//
// c) Fixed-length columns precede varying-length columns when
// both have the same size fixed-length part.
//
column_order.resize(num_cols);
for (uint32_t i = 0; i < num_cols; ++i) {
column_order[i] = i;
}

if (in_are_cols_sorted) {
// Sort columns.
//
// Columns are sorted based on the size in bytes of their fixed-length part.
// For the varying-length column, the fixed-length part is the 32-bit field storing
// cumulative length of varying-length fields. This is to make the memory access of
// each individual column within the encoded row alignment-friendly.
//
// The rules are:
//
// a) Boolean column, marked with fixed-length 0, is considered to have fixed-length
// part of 1 byte.
//
// b) Columns with fixed-length part being power of 2 or multiple of row
// alignment precede other columns. They are sorted in decreasing order of the size of
// their fixed-length part.
//
// c) Fixed-length columns precede varying-length columns when
// both have the same size fixed-length part.
//
std::sort(
column_order.begin(), column_order.end(), [&cols](uint32_t left, uint32_t right) {
bool is_left_pow2 = !cols[left].is_fixed_length ||
ARROW_POPCOUNT64(cols[left].fixed_length) <= 1;
bool is_right_pow2 = !cols[right].is_fixed_length ||
ARROW_POPCOUNT64(cols[right].fixed_length) <= 1;
bool is_left_fixedlen = cols[left].is_fixed_length;
bool is_right_fixedlen = cols[right].is_fixed_length;
uint32_t width_left =
cols[left].is_fixed_length ? cols[left].fixed_length : sizeof(uint32_t);
uint32_t width_right =
cols[right].is_fixed_length ? cols[right].fixed_length : sizeof(uint32_t);
if (is_left_pow2 != is_right_pow2) {
return is_left_pow2;
}
if (!is_left_pow2) {
return left < right;
}
if (width_left != width_right) {
return width_left > width_right;
}
if (is_left_fixedlen != is_right_fixedlen) {
return is_left_fixedlen;
}
std::sort(
column_order.begin(), column_order.end(), [&cols](uint32_t left, uint32_t right) {
bool is_left_pow2 =
!cols[left].is_fixed_length || ARROW_POPCOUNT64(cols[left].fixed_length) <= 1;
bool is_right_pow2 = !cols[right].is_fixed_length ||
ARROW_POPCOUNT64(cols[right].fixed_length) <= 1;
bool is_left_fixedlen = cols[left].is_fixed_length;
bool is_right_fixedlen = cols[right].is_fixed_length;
uint32_t width_left =
cols[left].is_fixed_length ? cols[left].fixed_length : sizeof(uint32_t);
uint32_t width_right =
cols[right].is_fixed_length ? cols[right].fixed_length : sizeof(uint32_t);
if (is_left_pow2 != is_right_pow2) {
return is_left_pow2;
}
if (!is_left_pow2) {
return left < right;
});
}
}
if (width_left != width_right) {
return width_left > width_right;
}
if (is_left_fixedlen != is_right_fixedlen) {
return is_left_fixedlen;
}
return left < right;
});
are_cols_sorted = false;
inverse_column_order.resize(num_cols);
for (uint32_t i = 0; i < num_cols; ++i) {
inverse_column_order[column_order[i]] = i;
// Check whether sorting changed column_order. If it did, mark the
// columns as sorted: the sorted column order is preferred for better
// performance in the grouper's compare.
if (inverse_column_order[i] != column_order[i] && are_cols_sorted == false) {
are_cols_sorted = true;
}
}

are_cols_sorted = in_are_cols_sorted;
row_alignment = in_row_alignment;
string_alignment = in_string_alignment;
varbinary_end_array_offset = 0;
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/arrow/compute/row/row_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,7 @@ struct ARROW_EXPORT RowTableMetadata {

/// \brief Populate this instance to describe `cols` with the given alignment
void FromColumnMetadataVector(const std::vector<KeyColumnMetadata>& cols,
int in_row_alignment, int in_string_alignment,
bool in_are_cols_sorted = true);
int in_row_alignment, int in_string_alignment);

/// \brief True if `other` has the same number of columns
/// and each column has the same width (two variable length
Expand Down

0 comments on commit 596fde8

Please sign in to comment.