Skip to content

Commit

Permalink
feat: Allow row_range to be treated as a clause
Browse files Browse the repository at this point in the history
Signed-off-by: Julien Jerphanion <git@jjerphan.xyz>
  • Loading branch information
jjerphan committed Sep 15, 2023
1 parent 9cd92a2 commit b83701b
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 10 deletions.
9 changes: 9 additions & 0 deletions cpp/arcticdb/processing/clause.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -670,8 +670,10 @@ void RowRangeClause::set_processing_config(const ProcessingConfig& processing_co
switch(row_range_type_) {
case RowRangeType::HEAD:
if (n_ >= 0) {
start_ = 0;
end_ = std::min(n_, total_rows);
} else {
start_ = 0;
end_ = std::max(static_cast<int64_t>(0), total_rows + n_);
}
break;
Expand All @@ -684,6 +686,13 @@ void RowRangeClause::set_processing_config(const ProcessingConfig& processing_co
end_ = total_rows;
}
break;
case RowRangeType::RANGE:
if (start_ > end_) {
internal::raise<ErrorCode::E_ASSERTION_FAILURE>("RowRangeClause start index {} is greater than end index {}", start_, end_);
}
n_ = end_ - start_;
break;

default:
internal::raise<ErrorCode::E_ASSERTION_FAILURE>("Unrecognised RowRangeType {}", static_cast<uint8_t>(row_range_type_));
}
Expand Down
11 changes: 9 additions & 2 deletions cpp/arcticdb/processing/clause.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -511,13 +511,14 @@ struct ColumnStatsGenerationClause {
struct RowRangeClause {
enum class RowRangeType: uint8_t {
HEAD,
TAIL
TAIL,
RANGE
};

ClauseInfo clause_info_;
RowRangeType row_range_type_;
// As passed into head or tail
int64_t n_;
int64_t n_{0};

// Row range to keep. Zero-indexed, inclusive of start, exclusive of end
// Calculated from n, whether the RowRangeType is head or tail, and the total rows as passed in by set_processing_config
Expand All @@ -529,6 +530,12 @@ struct RowRangeClause {
n_(n) {
}

explicit RowRangeClause(int64_t start, int64_t end):
row_range_type_(RowRangeType::RANGE),
start_(start),
end_(end) {
}

RowRangeClause() = delete;

ARCTICDB_MOVE_COPY_DEFAULT(RowRangeClause)
Expand Down
12 changes: 8 additions & 4 deletions cpp/arcticdb/version/python_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,9 +218,11 @@ void register_bindings(py::module &version, py::exception<arcticdb::ArcticExcept
.def_property_readonly("diff", &pipelines::RowRange::diff);

py::class_<pipelines::SignedRowRange, std::shared_ptr<pipelines::SignedRowRange>>(version, "SignedRowRange")
.def(py::init([](int64_t start, int64_t end){
return SignedRowRange{start, end};
}));
.def(py::init([](int64_t start, int64_t end){
return SignedRowRange{start, end};
}))
.def_readwrite("start_", &pipelines::SignedRowRange::start_)
.def_readwrite("end_", &pipelines::SignedRowRange::end_);

py::class_<pipelines::ColRange, std::shared_ptr<pipelines::ColRange>>(version, "ColRange")
.def_property_readonly("start", &pipelines::ColRange::start)
Expand Down Expand Up @@ -288,10 +290,12 @@ void register_bindings(py::module &version, py::exception<arcticdb::ArcticExcept

py::enum_<RowRangeClause::RowRangeType>(version, "RowRangeType")
.value("HEAD", RowRangeClause::RowRangeType::HEAD)
.value("TAIL", RowRangeClause::RowRangeType::TAIL);
.value("TAIL", RowRangeClause::RowRangeType::TAIL)
.value("RANGE", RowRangeClause::RowRangeType::TAIL);

py::class_<RowRangeClause, std::shared_ptr<RowRangeClause>>(version, "RowRangeClause")
.def(py::init<RowRangeClause::RowRangeType, int64_t>())
.def(py::init<int64_t, int64_t>())
.def("__str__", &RowRangeClause::to_string);

py::class_<DateRangeClause, std::shared_ptr<DateRangeClause>>(version, "DateRangeClause")
Expand Down
10 changes: 9 additions & 1 deletion python/arcticdb/version_store/_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -1592,10 +1592,12 @@ def read(
of the data that falls within the given range. The same effect can be achieved by using the date_range
clause of the QueryBuilder class, which will be slower, but return data with a smaller memory footprint.
See the QueryBuilder.date_range docstring for more details.
Only one of date_range or row_range can be provided.
row_range: `Optional[Tuple[int, int]]`, default=None
Row range to read data for. Inclusive of the lower bound, exclusive of the upper bound
lib.read(symbol, row_range=(start, end)).data should behave the same as df.iloc[start:end], including in
the handling of negative start/end values. Only one of date_range or row_range can be provided.
the handling of negative start/end values.
Only one of date_range or row_range can be provided.
columns: `Optional[List[str]]`, default=None
Applicable only for Pandas data. Determines which columns to return data for.
query_builder: 'Optional[QueryBuilder]', default=None
Expand All @@ -1610,9 +1612,15 @@ def read(

if row_range is not None:
row_range = _SignedRowRange(row_range[0], row_range[1])

if date_range is not None and query_builder is not None:
q = QueryBuilder()
query_builder = q.date_range(date_range).then(query_builder)

if row_range is not None and query_builder is not None:
q = QueryBuilder()
query_builder = q._row_range(row_range).then(query_builder)

version_query, read_options, read_query = self._get_queries(
as_of, date_range, row_range, columns, query_builder, **kwargs
)
Expand Down
17 changes: 16 additions & 1 deletion python/arcticdb/version_store/library.py
Original file line number Diff line number Diff line change
Expand Up @@ -922,6 +922,7 @@ def read(
symbol: str,
as_of: Optional[AsOf] = None,
date_range: Optional[Tuple[Optional[Timestamp], Optional[Timestamp]]] = None,
row_range: Optional[Tuple[int, int]] = None,
columns: Optional[List[str]] = None,
query_builder: Optional[QueryBuilder] = None,
) -> VersionedItem:
Expand Down Expand Up @@ -952,6 +953,15 @@ def read(
slower, but return data with a smaller memory footprint. See the QueryBuilder.date_range docstring for more
details.
Only one of date_range or row_range can be provided.
row_range: `Optional[Tuple[int, int]]`, default=None
Row range to read data for. Inclusive of the lower bound, exclusive of the upper bound
lib.read(symbol, row_range=(start, end)).data should behave the same as df.iloc[start:end], including in
the handling of negative start/end values.
Only one of date_range or row_range can be provided.
columns: List[str], default=None
Applicable only for Pandas data. Determines which columns to return data for.
Expand Down Expand Up @@ -983,7 +993,12 @@ def read(
2 7
"""
return self._nvs.read(
symbol=symbol, as_of=as_of, date_range=date_range, columns=columns, query_builder=query_builder
symbol=symbol,
as_of=as_of,
date_range=date_range,
row_range=row_range,
columns=columns,
query_builder=query_builder,
)

def read_batch(
Expand Down
17 changes: 15 additions & 2 deletions python/arcticdb/version_store/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,9 @@ def value_list_from_args(*args):
PythonProjectionClause = namedtuple("PythonProjectionClause", ["name", "expr"])
PythonGroupByClause = namedtuple("PythonGroupByClause", ["name"])
PythonAggregationClause = namedtuple("PythonAggregationClause", ["aggregations"])
PythonRowRangeClause = namedtuple("PythonRowRangeClause", ["row_range_type", "n"])
PythonRowRangeClause = namedtuple(
"PythonRowRangeClause", ["row_range_type", "n", "start", "end"], defaults=(None, None, None, None)
)
PythonDateRangeClause = namedtuple("PythonDateRangeClause", ["start", "end"])


Expand Down Expand Up @@ -519,9 +521,14 @@ def _head(self, n: int):
def _tail(self, n: int):
check(not len(self.clauses), "Tail only supported as first clause in the pipeline")
self.clauses.append(_RowRangeClause(_RowRangeType.TAIL, n))
self._python_clauses.append(PythonRowRangeClause(_RowRangeType.TAIL, n))
self._python_clauses.append(PythonRowRangeClause(row_range_type=_RowRangeType.TAIL, n=n))
return self

def _row_range(self, signed_row_range):
check(not len(self.clauses), "Row range only supported as first clause in the pipeline")
self.clauses.append(_RowRangeClause(_RowRangeType.RANGE, signed_row_range.start_, signed_row_range.end_))
self._python_clauses.append(PythonRowRangeClause(start=signed_row_range.start_, end=signed_row_range.end_))

def date_range(self, date_range: DateRangeInput):
"""
DateRange to read data for. Applicable only for Pandas data with a DateTime index. Returns only the part
Expand Down Expand Up @@ -551,6 +558,12 @@ def date_range(self, date_range: DateRangeInput):
self._python_clauses.append(PythonDateRangeClause(start.value, end.value))
return self

def _row_range(self, start: int, end: int):
check(not len(self.clauses), "Row range only supported as first clause in the pipeline")
self.clauses.append(_RowRangeClause(_RowRangeType.RANGE, start, end))
self._python_clauses.append(PythonRowRangeClause(start=start, end=end))
return self

def __eq__(self, right):
return self._optimisation == right._optimisation and self._python_clauses == right._python_clauses

Expand Down

0 comments on commit b83701b

Please sign in to comment.