Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Allow row_range to be treated as a clause #864

Merged
merged 25 commits into from
Nov 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
45ca483
feat: Allow `row_range` to be treated as a clause
jjerphan Sep 14, 2023
ff2bce3
Use `signed_row_range` for consistency
jjerphan Sep 15, 2023
2fdea1c
Add row_range to ReadRequest
jjerphan Sep 15, 2023
ed2dfe0
Extend batch methods for row_range
jjerphan Sep 15, 2023
208ab56
Define PythonRowRangeClause using NamedTuple
jjerphan Sep 15, 2023
c102d5e
Move SignedRowRange down the python front-end
jjerphan Sep 15, 2023
df415aa
Allow to chain on `QueryBuilder._row_range`
jjerphan Sep 15, 2023
942cdf2
test: Add tests for row_range clause
jjerphan Sep 15, 2023
885136e
Removed SignedRowRange entirely
jjerphan Sep 18, 2023
86f2a23
python: Row Range normalization
jjerphan Sep 18, 2023
812c68a
test: Add a simple test for the RowRange clause
jjerphan Sep 18, 2023
2b996f8
Make RowRangeClause slices on rows instead
jjerphan Sep 18, 2023
e36903a
test: Fix test
jjerphan Sep 20, 2023
1ca6ba3
Merge branch 'master' into feat/read-row_range-clause
jjerphan Oct 3, 2023
53a012e
Wrap around negative indices
jjerphan Oct 5, 2023
a33bc6b
Rework test regarding range exclusivity
jjerphan Oct 5, 2023
e2b97e9
Revert changes to the C++ layer
jjerphan Oct 17, 2023
68e0fa7
test: Remove `test_normalize_row_range`
jjerphan Oct 17, 2023
74c7ea3
Address review comments
jjerphan Oct 19, 2023
02c026b
fixup! Address review comments
jjerphan Oct 19, 2023
27a4a44
docs: Reword comment
jjerphan Oct 19, 2023
8e4a390
wip: Edge case for empty ChunkedBufferImpl
jjerphan Oct 24, 2023
24f8e36
Merge branch 'master' into feat/read-row_range-clause
jjerphan Oct 31, 2023
8e16ae8
wip: Resolve segmentation fault on Segment truncation
jjerphan Nov 2, 2023
50bebc0
fix: Resolve segmentation fault on Segment truncation
jjerphan Nov 2, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions cpp/arcticdb/column_store/chunked_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,8 @@ template std::vector<ChunkedBufferImpl<3968>> split(const ChunkedBufferImpl<3968
// Inclusive of start_byte, exclusive of end_byte
template <size_t BlockSize>
ChunkedBufferImpl<BlockSize> truncate(const ChunkedBufferImpl<BlockSize>& input, size_t start_byte, size_t end_byte) {
internal::check<ErrorCode::E_ASSERTION_FAILURE>(
end_byte >= start_byte,
"ChunkedBufferImpl::truncate expects end_byte {} to be >= start_byte {}", end_byte, start_byte);
ARCTICDB_DEBUG(log::version(), "Truncating buffer of size {} between bytes {} and {}", input.bytes(), start_byte, end_byte);
const auto output_size = end_byte - start_byte;
const auto output_size = start_byte >= end_byte ? 0 : end_byte - start_byte;
// This is trivially extendable to use presized_in_blocks, but there is no use case for this right now, and
// copy_frame_data_to_buffer expects a contiguous buffer
auto output = ChunkedBufferImpl<BlockSize>::presized(output_size);
Expand Down
4 changes: 2 additions & 2 deletions cpp/arcticdb/column_store/memory_segment_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,7 @@ class SegmentInMemoryImpl {

size_t num_fields() const { return descriptor().field_count(); }

size_t row_count() const { return size_t(row_id_ + 1); }
size_t row_count() const { return row_id_ + 1 < 0 ? 0 : size_t(row_id_ + 1); }

void clear() {
columns_.clear();
Expand Down Expand Up @@ -1141,7 +1141,7 @@ class SegmentInMemoryImpl {
size_t end_row) const {
auto num_values = end_row - start_row;
internal::check<ErrorCode::E_ASSERTION_FAILURE>(
is_sparse() || (start_row < row_count() && end_row <= row_count() && num_values > 0),
is_sparse() || (start_row < row_count() && end_row <= row_count()),
"Truncate bounds start_row={} end_row={} outside valid range {}", start_row, end_row, row_count());

auto output = std::make_shared<SegmentInMemoryImpl>();
Expand Down
8 changes: 4 additions & 4 deletions cpp/arcticdb/pipeline/frame_slice.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,19 @@ namespace arcticdb::pipelines {
struct AxisRange : std::pair<std::size_t, std::size_t> {
using std::pair<std::size_t, std::size_t>::pair;

[[nodiscard]] auto diff() const {
return second - first;
[[nodiscard]] std::size_t diff() const {
return first > second ? 0 : second - first;
}

[[nodiscard]] bool contains(std::size_t v) const {
return first <= v && v < second;
}

[[nodiscard]] auto start() const {
[[nodiscard]] std::size_t start() const {
return first;
}

[[nodiscard]] auto end() const {
[[nodiscard]] std::size_t end() const {
return second;
}

Expand Down
9 changes: 8 additions & 1 deletion cpp/arcticdb/pipeline/query.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ namespace arcticdb::pipelines {

using FilterRange = std::variant<std::monostate, IndexRange, RowRange>;

/*
* A structure which is used to store the potentially negative values for indices of a row range
*/
struct SignedRowRange {
int64_t start_;
int64_t end_;
Expand All @@ -54,7 +57,11 @@ struct ReadQuery {
clauses_ = clauses;
}

void calculate_row_filter(int64_t total_rows) {
/*
* This is used to set the row filter to a row range not to perform a query of the index key
* to get the total number of rows in the index, preventing the cost of performing an extra request.
*/
void calculate_row_filter(int64_t total_rows) {
if (row_range.has_value()) {
size_t start = row_range->start_ >= 0 ?
std::min(row_range->start_, total_rows) :
Expand Down
44 changes: 38 additions & 6 deletions cpp/arcticdb/processing/clause.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -637,7 +637,7 @@ std::vector<Composite<SliceAndKey>> RowRangeClause::structure_for_processing(
slice_and_keys.erase(std::remove_if(slice_and_keys.begin(), slice_and_keys.end(), [this](const SliceAndKey& slice_and_key) {
return slice_and_key.slice_.row_range.start() >= end_ || slice_and_key.slice_.row_range.end() <= start_;
}), slice_and_keys.end());
return structure_by_column_slice(slice_and_keys);
return structure_by_row_slice(slice_and_keys, start_from);
jjerphan marked this conversation as resolved.
Show resolved Hide resolved
}

Composite<ProcessingUnit> RowRangeClause::process(std::shared_ptr<Store> store,
Expand All @@ -657,10 +657,22 @@ Composite<ProcessingUnit> RowRangeClause::process(std::shared_ptr<Store> store,
if (end_ > row_range.start() && end_ < row_range.end()) {
end_row = end_ - (row_range.start());
}
auto seg = truncate_segment(slice_and_key.segment(store), start_row, end_row);
slice_and_key.slice_.adjust_rows(seg.row_count());
slice_and_key.slice_.adjust_columns(seg.descriptor().field_count() - seg.descriptor().index().field_count());
slice_and_key.segment_ = std::move(seg);
auto original_segment = slice_and_key.segment(store);
auto truncated_segment = truncate_segment(original_segment, start_row, end_row);
if(!truncated_segment.is_null()) {
const size_t truncated_segment_row_count = truncated_segment.row_count();
const size_t truncated_segment_column_count = (
truncated_segment.descriptor().field_count() -
truncated_segment.descriptor().index().field_count()
);

slice_and_key.slice_.adjust_rows(truncated_segment_row_count);
slice_and_key.slice_.adjust_columns(truncated_segment_column_count);
} else {
slice_and_key.slice_.adjust_rows(0u);
slice_and_key.slice_.adjust_columns(0u);
}
slice_and_key.segment_ = std::move(truncated_segment);
} // else all rows in the slice and key are required, do nothing
}
});
Expand All @@ -672,8 +684,10 @@ void RowRangeClause::set_processing_config(const ProcessingConfig& processing_co
switch(row_range_type_) {
case RowRangeType::HEAD:
if (n_ >= 0) {
start_ = 0;
end_ = std::min(n_, total_rows);
} else {
start_ = 0;
end_ = std::max(static_cast<int64_t>(0), total_rows + n_);
}
break;
Expand All @@ -686,13 +700,31 @@ void RowRangeClause::set_processing_config(const ProcessingConfig& processing_co
end_ = total_rows;
}
break;
case RowRangeType::RANGE:
// Wrap around negative indices.
start_ = (
user_provided_start_ >= 0 ?
std::min(user_provided_start_, total_rows) :
std::max(total_rows + user_provided_start_, static_cast<int64_t>(0))
);
end_ = (
user_provided_end_ >= 0 ?
std::min(user_provided_end_, total_rows) :
std::max(total_rows + user_provided_end_, static_cast<int64_t>(0))
);
break;

default:
internal::raise<ErrorCode::E_ASSERTION_FAILURE>("Unrecognised RowRangeType {}", static_cast<uint8_t>(row_range_type_));
}
}

std::string RowRangeClause::to_string() const {
return fmt::format("{} {}", row_range_type_ == RowRangeType::HEAD ? "HEAD" : "TAIL", n_);
if (row_range_type_ == RowRangeType::RANGE) {
return fmt::format("ROWRANGE: RANGE, start={}, end ={}", start_, end_);
}

return fmt::format("ROWRANGE: {}, n={}", row_range_type_ == RowRangeType::HEAD ? "HEAD" : "TAIL", n_);
}

std::vector<Composite<SliceAndKey>> DateRangeClause::structure_for_processing(
Expand Down
25 changes: 20 additions & 5 deletions cpp/arcticdb/processing/clause.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -511,16 +511,25 @@ struct ColumnStatsGenerationClause {
struct RowRangeClause {
enum class RowRangeType: uint8_t {
HEAD,
TAIL
TAIL,
RANGE
};

ClauseInfo clause_info_;
RowRangeType row_range_type_;
// As passed into head or tail
int64_t n_;

// Row range to keep. Zero-indexed, inclusive of start, exclusive of end
// Calculated from n, whether the RowRangeType is head or tail, and the total rows as passed in by set_processing_config
int64_t n_{0};

// User provided values, which are used to calculate start and end.
// Both can be provided with negative values to wrap indices.
int64_t user_provided_start_;
int64_t user_provided_end_;

// Row range to keep. Zero-indexed, inclusive of start, exclusive of end.
// If the RowRangeType is `HEAD` or `TAIL`, this is calculated from `n` and
// the total rows as passed in by `set_processing_config`.
// If the RowRangeType is `RANGE`, then start and end are set using the
// user-provided values as passed in by `set_processing_config`.
uint64_t start_{0};
uint64_t end_{0};

Expand All @@ -529,6 +538,12 @@ struct RowRangeClause {
n_(n) {
}

explicit RowRangeClause(int64_t start, int64_t end):
row_range_type_(RowRangeType::RANGE),
user_provided_start_(start),
user_provided_end_(end) {
}

RowRangeClause() = delete;

ARCTICDB_MOVE_COPY_DEFAULT(RowRangeClause)
Expand Down
4 changes: 3 additions & 1 deletion cpp/arcticdb/version/python_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,10 +289,12 @@ void register_bindings(py::module &version, py::exception<arcticdb::ArcticExcept

py::enum_<RowRangeClause::RowRangeType>(version, "RowRangeType")
.value("HEAD", RowRangeClause::RowRangeType::HEAD)
.value("TAIL", RowRangeClause::RowRangeType::TAIL);
.value("TAIL", RowRangeClause::RowRangeType::TAIL)
.value("RANGE", RowRangeClause::RowRangeType::RANGE);

py::class_<RowRangeClause, std::shared_ptr<RowRangeClause>>(version, "RowRangeClause")
.def(py::init<RowRangeClause::RowRangeType, int64_t>())
.def(py::init<int64_t, int64_t>())
.def("__str__", &RowRangeClause::to_string);

py::class_<DateRangeClause, std::shared_ptr<DateRangeClause>>(version, "DateRangeClause")
Expand Down
Loading
Loading