Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions ydb/core/formats/arrow/arrow_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -611,4 +611,12 @@ std::optional<ui32> TColumnFilter::GetFilteredCount() const {
return *FilteredCount;
}

void TColumnFilter::Append(const TColumnFilter& filter) {
bool currentVal = filter.GetStartValue();
for (auto&& i : filter.Filter) {
Add(currentVal, i);
currentVal = !currentVal;
}
}

}
1 change: 1 addition & 0 deletions ydb/core/formats/arrow/arrow_filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ class TColumnFilter {
FilteredCount.reset();
}
public:
void Append(const TColumnFilter& filter);
void Add(const bool value, const ui32 count = 1);
std::optional<ui32> GetFilteredCount() const;
const std::vector<bool>& BuildSimpleFilter() const;
Expand Down
31 changes: 31 additions & 0 deletions ydb/core/formats/arrow/arrow_helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -977,4 +977,35 @@ std::shared_ptr<arrow::RecordBatch> MergeColumns(const std::vector<std::shared_p
return arrow::RecordBatch::Make(schema, *recordsCount, columns);
}

// Splits a table into record batches such that every column of every batch
// is backed by exactly one array.
//
// The split points are the union of the chunk start offsets of all columns;
// each column is then sliced at every split point and the resulting
// single-chunk slices are assembled into one RecordBatch per range.
// Verifies that every column covers t->num_rows() rows and that the produced
// batches add up to the same row count.
std::vector<std::shared_ptr<arrow::RecordBatch>> SliceToRecordBatches(const std::shared_ptr<arrow::Table>& t) {
    const ui32 numRows = t->num_rows();

    // Union of chunk start offsets over all columns (sorted, de-duplicated).
    std::set<ui32> chunkStarts;
    for (const auto& column : t->columns()) {
        ui32 pos = 0;
        for (const auto& chunk : column->chunks()) {
            chunkStarts.emplace(pos);
            pos += chunk->length();
        }
        AFL_VERIFY(pos == t->num_rows());
    }
    const std::vector<ui32> starts(chunkStarts.begin(), chunkStarts.end());

    // columnsPerBatch[k] collects, column by column, the arrays of the k-th batch.
    std::vector<std::vector<std::shared_ptr<arrow::Array>>> columnsPerBatch(starts.size());
    for (ui32 batchIdx = 0; batchIdx < starts.size(); ++batchIdx) {
        const ui32 begin = starts[batchIdx];
        // The last range runs up to the end of the table.
        const ui32 end = (batchIdx + 1 == starts.size()) ? numRows : starts[batchIdx + 1];
        for (const auto& column : t->columns()) {
            auto slice = column->Slice(begin, end - begin);
            AFL_VERIFY(slice->num_chunks() == 1);
            columnsPerBatch[batchIdx].emplace_back(slice->chunks().front());
        }
    }

    std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
    ui32 count = 0;
    for (const auto& batchColumns : columnsPerBatch) {
        // All arrays of a batch have the same length; take it from the first one.
        const auto batchRows = batchColumns.front()->length();
        batches.emplace_back(arrow::RecordBatch::Make(t->schema(), batchRows, batchColumns));
        count += batches.back()->num_rows();
    }
    AFL_VERIFY(count == t->num_rows())("count", count)("t", t->num_rows());
    return batches;
}

}
2 changes: 2 additions & 0 deletions ydb/core/formats/arrow/arrow_helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ inline bool HasNulls(const std::shared_ptr<arrow::Array>& column) {
return column->null_bitmap_data();
}

std::vector<std::shared_ptr<arrow::RecordBatch>> SliceToRecordBatches(const std::shared_ptr<arrow::Table>& t);

bool ArrayScalarsEqual(const std::shared_ptr<arrow::Array>& lhs, const std::shared_ptr<arrow::Array>& rhs);
std::shared_ptr<arrow::Array> BoolVecToArray(const std::vector<bool>& vec);

Expand Down
18 changes: 12 additions & 6 deletions ydb/core/formats/arrow/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -871,12 +871,18 @@ std::shared_ptr<NArrow::TColumnFilter> TProgramStep::BuildFilter(const std::shar
if (Filters.empty()) {
return nullptr;
}
auto datumBatch = TDatumBatch::FromTable(t);

NArrow::TStatusValidator::Validate(ApplyAssignes(*datumBatch, NArrow::GetCustomExecContext()));
NArrow::TColumnFilter local = NArrow::TColumnFilter::BuildAllowFilter();
NArrow::TStatusValidator::Validate(MakeCombinedFilter(*datumBatch, local));
return std::make_shared<NArrow::TColumnFilter>(std::move(local));
std::vector<std::shared_ptr<arrow::RecordBatch>> batches = NArrow::SliceToRecordBatches(t);
NArrow::TColumnFilter fullLocal = NArrow::TColumnFilter::BuildAllowFilter();
for (auto&& rb : batches) {
auto datumBatch = TDatumBatch::FromRecordBatch(rb);
NArrow::TStatusValidator::Validate(ApplyAssignes(*datumBatch, NArrow::GetCustomExecContext()));
NArrow::TColumnFilter local = NArrow::TColumnFilter::BuildAllowFilter();
NArrow::TStatusValidator::Validate(MakeCombinedFilter(*datumBatch, local));
AFL_VERIFY(local.Size() == datumBatch->Rows)("local", local.Size())("datum", datumBatch->Rows);
fullLocal.Append(local);
}
AFL_VERIFY(fullLocal.Size() == t->num_rows())("filter", fullLocal.Size())("t", t->num_rows());
return std::make_shared<NArrow::TColumnFilter>(std::move(fullLocal));
}

const std::set<ui32>& TProgramStep::GetFilterOriginalColumnIds() const {
Expand Down