feat: Allow row_range to be treated as a clause

Signed-off-by: Julien Jerphanion <git@jjerphan.xyz>
man-group · Sep 15, 2023 · b83701b · b83701b
1 parent 9cd92a2
commit b83701b
Show file tree

Hide file tree

Showing 6 changed files with 66 additions and 10 deletions.
diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp
@@ -670,8 +670,10 @@ void RowRangeClause::set_processing_config(const ProcessingConfig& processing_co
  switch(row_range_type_) {
  case RowRangeType::HEAD:
  if (n_ >= 0) {
+ start_ = 0;
  end_ = std::min(n_, total_rows);
  } else {
+ start_ = 0;
  end_ = std::max(static_cast<int64_t>(0), total_rows + n_);
  }
  break;
@@ -684,6 +686,13 @@ void RowRangeClause::set_processing_config(const ProcessingConfig& processing_co
  end_ = total_rows;
  }
  break;
+ case RowRangeType::RANGE:
+ if (start_ > end_) {
+ internal::raise<ErrorCode::E_ASSERTION_FAILURE>("RowRangeClause start index {} is greater than end index {}", start_, end_);
+ }
+ n_ = end_ - start_;
+ break;
+
  default:
  internal::raise<ErrorCode::E_ASSERTION_FAILURE>("Unrecognised RowRangeType {}", static_cast<uint8_t>(row_range_type_));
  }

diff --git a/cpp/arcticdb/processing/clause.hpp b/cpp/arcticdb/processing/clause.hpp
@@ -511,13 +511,14 @@ struct ColumnStatsGenerationClause {
 struct RowRangeClause {
  enum class RowRangeType: uint8_t {
  HEAD,
- TAIL
+ TAIL,
+ RANGE
  };
 
  ClauseInfo clause_info_;
  RowRangeType row_range_type_;
  // As passed into head or tail
- int64_t n_;
+ int64_t n_{0};
 
  // Row range to keep. Zero-indexed, inclusive of start, exclusive of end
  // Calculated from n, whether the RowRangeType is head or tail, and the total rows as passed in by set_processing_config
@@ -529,6 +530,12 @@ struct RowRangeClause {
  n_(n) {
  }
 
+ explicit RowRangeClause(int64_t start, int64_t end):
+ row_range_type_(RowRangeType::RANGE),
+ start_(start),
+ end_(end) {
+ }
+
  RowRangeClause() = delete;
 
  ARCTICDB_MOVE_COPY_DEFAULT(RowRangeClause)

diff --git a/cpp/arcticdb/version/python_bindings.cpp b/cpp/arcticdb/version/python_bindings.cpp
@@ -218,9 +218,11 @@ void register_bindings(py::module &version, py::exception<arcticdb::ArcticExcept
  .def_property_readonly("diff", &pipelines::RowRange::diff);
 
  py::class_<pipelines::SignedRowRange, std::shared_ptr<pipelines::SignedRowRange>>(version, "SignedRowRange")
- .def(py::init([](int64_t start, int64_t end){
- return SignedRowRange{start, end};
- }));
+ .def(py::init([](int64_t start, int64_t end){
+ return SignedRowRange{start, end};
+ }))
+ .def_readwrite("start_", &pipelines::SignedRowRange::start_)
+ .def_readwrite("end_", &pipelines::SignedRowRange::end_);
 
  py::class_<pipelines::ColRange, std::shared_ptr<pipelines::ColRange>>(version, "ColRange")
  .def_property_readonly("start", &pipelines::ColRange::start)
@@ -288,10 +290,12 @@ void register_bindings(py::module &version, py::exception<arcticdb::ArcticExcept
 
  py::enum_<RowRangeClause::RowRangeType>(version, "RowRangeType")
  .value("HEAD", RowRangeClause::RowRangeType::HEAD)
- .value("TAIL", RowRangeClause::RowRangeType::TAIL);
+ .value("TAIL", RowRangeClause::RowRangeType::TAIL)
+ .value("RANGE", RowRangeClause::RowRangeType::TAIL);
 
  py::class_<RowRangeClause, std::shared_ptr<RowRangeClause>>(version, "RowRangeClause")
  .def(py::init<RowRangeClause::RowRangeType, int64_t>())
+ .def(py::init<int64_t, int64_t>())
  .def("__str__", &RowRangeClause::to_string);
 
  py::class_<DateRangeClause, std::shared_ptr<DateRangeClause>>(version, "DateRangeClause")

diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py
@@ -1592,10 +1592,12 @@ def read(
  of the data that falls within the given range. The same effect can be achieved by using the date_range
  clause of the QueryBuilder class, which will be slower, but return data with a smaller memory footprint.
  See the QueryBuilder.date_range docstring for more details.
+ Only one of date_range or row_range can be provided.
  row_range: `Optional[Tuple[int, int]]`, default=None
  Row range to read data for. Inclusive of the lower bound, exclusive of the upper bound
  lib.read(symbol, row_range=(start, end)).data should behave the same as df.iloc[start:end], including in
- the handling of negative start/end values. Only one of date_range or row_range can be provided.
+ the handling of negative start/end values.
+ Only one of date_range or row_range can be provided.
  columns: `Optional[List[str]]`, default=None
  Applicable only for Pandas data. Determines which columns to return data for.
  query_builder: 'Optional[QueryBuilder]', default=None
@@ -1610,9 +1612,15 @@ def read(
 
  if row_range is not None:
  row_range = _SignedRowRange(row_range[0], row_range[1])
+
  if date_range is not None and query_builder is not None:
  q = QueryBuilder()
  query_builder = q.date_range(date_range).then(query_builder)
+
+ if row_range is not None and query_builder is not None:
+ q = QueryBuilder()
+ query_builder = q._row_range(row_range).then(query_builder)
+
  version_query, read_options, read_query = self._get_queries(
  as_of, date_range, row_range, columns, query_builder, **kwargs
  )

diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py
@@ -922,6 +922,7 @@ def read(
  symbol: str,
  as_of: Optional[AsOf] = None,
  date_range: Optional[Tuple[Optional[Timestamp], Optional[Timestamp]]] = None,
+ row_range: Optional[Tuple[int, int]] = None,
  columns: Optional[List[str]] = None,
  query_builder: Optional[QueryBuilder] = None,
  ) -> VersionedItem:
@@ -952,6 +953,15 @@ def read(
  slower, but return data with a smaller memory footprint. See the QueryBuilder.date_range docstring for more
  details.
 
+ Only one of date_range or row_range can be provided.
+
+ row_range: `Optional[Tuple[int, int]]`, default=None
+ Row range to read data for. Inclusive of the lower bound, exclusive of the upper bound
+ lib.read(symbol, row_range=(start, end)).data should behave the same as df.iloc[start:end], including in
+ the handling of negative start/end values.
+
+ Only one of date_range or row_range can be provided.
+
  columns: List[str], default=None
  Applicable only for Pandas data. Determines which columns to return data for.
 
@@ -983,7 +993,12 @@ def read(
  2 7
  """
  return self._nvs.read(
- symbol=symbol, as_of=as_of, date_range=date_range, columns=columns, query_builder=query_builder
+ symbol=symbol,
+ as_of=as_of,
+ date_range=date_range,
+ row_range=row_range,
+ columns=columns,
+ query_builder=query_builder,
  )
 
  def read_batch(

diff --git a/python/arcticdb/version_store/processing.py b/python/arcticdb/version_store/processing.py
@@ -266,7 +266,9 @@ def value_list_from_args(*args):
 PythonProjectionClause = namedtuple("PythonProjectionClause", ["name", "expr"])
 PythonGroupByClause = namedtuple("PythonGroupByClause", ["name"])
 PythonAggregationClause = namedtuple("PythonAggregationClause", ["aggregations"])
-PythonRowRangeClause = namedtuple("PythonRowRangeClause", ["row_range_type", "n"])
+PythonRowRangeClause = namedtuple(
+ "PythonRowRangeClause", ["row_range_type", "n", "start", "end"], defaults=(None, None, None, None)
+)
 PythonDateRangeClause = namedtuple("PythonDateRangeClause", ["start", "end"])
 
 
@@ -519,9 +521,14 @@ def _head(self, n: int):
  def _tail(self, n: int):
  check(not len(self.clauses), "Tail only supported as first clause in the pipeline")
  self.clauses.append(_RowRangeClause(_RowRangeType.TAIL, n))
- self._python_clauses.append(PythonRowRangeClause(_RowRangeType.TAIL, n))
+ self._python_clauses.append(PythonRowRangeClause(row_range_type=_RowRangeType.TAIL, n=n))
  return self
 
+ def _row_range(self, signed_row_range):
+ check(not len(self.clauses), "Row range only supported as first clause in the pipeline")
+ self.clauses.append(_RowRangeClause(_RowRangeType.RANGE, signed_row_range.start_, signed_row_range.end_))
+ self._python_clauses.append(PythonRowRangeClause(start=signed_row_range.start_, end=signed_row_range.end_))
+
  def date_range(self, date_range: DateRangeInput):
  """
  DateRange to read data for. Applicable only for Pandas data with a DateTime index. Returns only the part
@@ -551,6 +558,12 @@ def date_range(self, date_range: DateRangeInput):
  self._python_clauses.append(PythonDateRangeClause(start.value, end.value))
  return self
 
+ def _row_range(self, start: int, end: int):
+ check(not len(self.clauses), "Row range only supported as first clause in the pipeline")
+ self.clauses.append(_RowRangeClause(_RowRangeType.RANGE, start, end))
+ self._python_clauses.append(PythonRowRangeClause(start=start, end=end))
+ return self
+
  def __eq__(self, right):
  return self._optimisation == right._optimisation and self._python_clauses == right._python_clauses