Fixes for sort and finalize (#1763)
Fixes: #1737 #1736 #1735

<details>
  <summary>
   Checklist for code changes...
  </summary>

- [ ] Have you updated the relevant docstrings, documentation and
copyright notice?
- [ ] Is this contribution tested against [all ArcticDB's
features](../docs/mkdocs/docs/technical/contributing.md)?
- [ ] Do all exceptions introduced raise appropriate [error
messages](https://docs.arcticdb.io/error_messages/)?
- [ ] Are API changes highlighted in the PR description?
- [ ] Is the PR labelled as enhancement or bug so it appears in
autogenerated release notes?
</details>


---------

Co-authored-by: Vasil Pashov <vasil.pashov@man.com>
vasil-pashov and Vasil Pashov committed Sep 30, 2024
1 parent 44a5a67 commit a51ff8b
Showing 14 changed files with 317 additions and 45 deletions.
2 changes: 0 additions & 2 deletions cpp/arcticdb/pipeline/frame_slice.hpp
@@ -83,7 +83,6 @@ struct FrameSlice {
hash_bucket_(hash),
num_buckets_(num_buckets),
indices_(std::move(indices)) {
util::check(col_range.diff() > 0 || row_range.diff() > 0, "Expected non-zero column or row range");
}

FrameSlice(const ColRange& col_range, const RowRange& row_range,
@@ -93,7 +92,6 @@ struct FrameSlice {
row_range(row_range),
hash_bucket_(hash_bucket),
num_buckets_(num_buckets) {
util::check(col_range.diff() > 0 || row_range.diff() > 0, "Expected non-zero column or row range");
}

explicit FrameSlice(const SegmentInMemory& seg);
3 changes: 3 additions & 0 deletions cpp/arcticdb/pipeline/frame_utils.cpp
@@ -100,6 +100,9 @@ size_t adjust_slice_rowcounts(std::vector<pipelines::SliceAndKey> & slice_and_ke
}

size_t get_slice_rowcounts(std::vector<pipelines::SliceAndKey> & slice_and_keys) {
if (slice_and_keys.empty()) {
return 0;
}
auto current_col = slice_and_keys[0].slice_.col_range.first;
size_t rowcount = 0u;
for (auto& slice_and_key : slice_and_keys) {
20 changes: 8 additions & 12 deletions cpp/arcticdb/pipeline/index_utils.hpp
@@ -36,21 +36,17 @@ inline std::vector<SliceAndKey> unfiltered_index(const index::IndexSegmentReader

template<typename RowType>
std::optional<IndexValue> index_value_from_row(const RowType &row, IndexDescriptorImpl::Type index_type, int field_num) {
std::optional<IndexValue> index_value;
switch (index_type) {
case IndexDescriptorImpl::Type::TIMESTAMP:
case IndexDescriptorImpl::Type::ROWCOUNT:
index_value = row.template scalar_at<timestamp>(field_num);
break;
case IndexDescriptorImpl::Type::STRING: {
auto opt = row.string_at(field_num);
index_value = opt ? std::make_optional<IndexValue>(std::string(opt.value())) : std::nullopt;
break;
}
default:
util::raise_rte("Unknown index type {} for column {}", int(index_type), field_num);
case IndexDescriptorImpl::Type::ROWCOUNT: return row.template scalar_at<timestamp>(field_num);
case IndexDescriptorImpl::Type::STRING: {
auto opt = row.string_at(field_num);
return opt ? std::make_optional<IndexValue>(std::string(opt.value())) : std::nullopt;
}
return index_value;
default:
util::raise_rte("Unknown index type {} for column {}", int(index_type), field_num);
}
return std::nullopt;
}

template<typename RowType>
2 changes: 1 addition & 1 deletion cpp/arcticdb/pipeline/read_pipeline.hpp
@@ -173,7 +173,7 @@ inline std::optional<util::BitSet> overall_column_bitset(

inline void generate_filtered_field_descriptors(PipelineContext& context, const std::optional<std::vector<std::string>>& columns) {
if (columns.has_value()) {
std::unordered_set<std::string_view> column_set{std::begin(*columns), std::end(*columns)};
const ankerl::unordered_dense::set<std::string_view> column_set{std::begin(*columns), std::end(*columns)};

context.filter_columns_ = std::make_shared<FieldCollection>();
const auto& desc = context.descriptor();
7 changes: 6 additions & 1 deletion cpp/arcticdb/processing/clause.cpp
@@ -34,7 +34,7 @@ std::vector<std::vector<size_t>> structure_by_row_slice(
ranges_and_keys.erase(ranges_and_keys.begin(), ranges_and_keys.begin() + start_from);

std::vector<std::vector<size_t>> res;
RowRange previous_row_range;
RowRange previous_row_range{-1, -1};
for (const auto& [idx, ranges_and_key]: folly::enumerate(ranges_and_keys)) {
RowRange current_row_range{ranges_and_key.row_range_};
if (current_row_range != previous_row_range) {
@@ -941,6 +941,11 @@ std::optional<std::vector<std::vector<EntityId>>> MergeClause::repartition(std::
auto compare =
[](const std::unique_ptr<SegmentWrapper> &left,
const std::unique_ptr<SegmentWrapper> &right) {
if (left->seg_.row_count() == 0) {
return false;
} else if (right->seg_.row_count() == 0) {
return true;
}
const auto left_index = index::index_value_from_row(left->row(), IndexDescriptorImpl::Type::TIMESTAMP, 0);
const auto right_index = index::index_value_from_row(right->row(), IndexDescriptorImpl::Type::TIMESTAMP, 0);
return left_index > right_index;
17 changes: 12 additions & 5 deletions cpp/arcticdb/stream/append_map.cpp
@@ -147,17 +147,24 @@ SegmentInMemory incomplete_segment_from_frame(
auto index_tensor = std::move(frame->index_tensor);
const bool has_index = frame->has_index();
const auto index = std::move(frame->index);
SegmentInMemory output;
auto field_tensors = std::move(frame->field_tensors);

std::visit([&](const auto& idx) {
auto field_tensors = std::move(frame->field_tensors);
auto output = std::visit([&](const auto& idx) {
using IdxType = std::decay_t<decltype(idx)>;
using SingleSegmentAggregator = Aggregator<IdxType, FixedSchema, NeverSegmentPolicy>;
auto copy_prev_key = prev_key;
auto timeseries_desc = index_descriptor_from_frame(frame, existing_rows, std::move(prev_key));
util::check(!timeseries_desc.fields().empty(), "Expected fields not to be empty in incomplete segment");
auto norm_meta = timeseries_desc.proto().normalization();
auto descriptor = timeseries_desc.as_stream_descriptor();

SegmentInMemory output;
if (num_rows == 0) {
output = SegmentInMemory(FixedSchema{descriptor, index}.default_descriptor(), 0, false, false);
output.set_timeseries_descriptor(pack_timeseries_descriptor(descriptor, existing_rows, std::move(copy_prev_key), std::move(norm_meta)));
return output;
}

SingleSegmentAggregator agg{FixedSchema{descriptor, index}, [&](auto&& segment) {
auto tsd = pack_timeseries_descriptor(descriptor, existing_rows + num_rows, std::move(copy_prev_key), std::move(norm_meta));
segment.set_timeseries_descriptor(tsd);
@@ -194,7 +201,8 @@ SegmentInMemory incomplete_segment_from_frame(

agg.end_block_write(num_rows);
agg.commit();
}, index);
return output;
}, index);

ARCTICDB_DEBUG(log::version(), "Constructed segment from frame of {} rows and {} columns at offset {}", output.row_count(), output.num_columns(), output.offset());
return output;
@@ -241,7 +249,6 @@ std::vector<SliceAndKey> get_incomplete(
bool load_data) {
using namespace arcticdb::pipelines;

std::unique_ptr<TimeseriesDescriptor> unused;
auto entries = get_incomplete_append_slices_for_stream_id(store, stream_id, via_iteration, load_data);

util::variant_match(range,
3 changes: 3 additions & 0 deletions cpp/arcticdb/stream/merge.hpp
@@ -19,6 +19,9 @@ void do_merge(
while (!input_streams.empty()) {
auto next = input_streams.pop_top();

if (next->row().parent_->row_count() == 0) {
continue;
}
agg.start_row(pipelines::index::index_value_from_row(next->row(), IndexDescriptorImpl::Type::TIMESTAMP, 0).value()) ([&next, add_symbol_column](auto &rb) {
if(add_symbol_column)
rb.set_scalar_by_name("symbol", std::string_view(std::get<StringId>(next->id())), DataType::UTF_DYNAMIC64);
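
Taken together, the empty-segment guards above (the MergeClause comparator in clause.cpp, the zero-row path in incomplete_segment_from_frame, and the skip in do_merge) let the sort-and-finalize path tolerate staged segments with no rows. A minimal sketch of the user-facing scenario, assuming an LMDB-backed library and that a zero-row DataFrame can be staged (the URI and symbol names are illustrative, not taken from this PR):

```python
import pandas as pd
from arcticdb import Arctic

ac = Arctic("lmdb://./arctic_demo")  # illustrative URI
lib = ac.get_library("staging_demo", create_if_missing=True)

full = pd.DataFrame({"price": [1.0, 2.0]},
                    index=pd.date_range("2024-01-01", periods=2))
empty = full.iloc[0:0]  # zero-row segment with the same schema

# Stage a mix of non-empty and empty segments, then sort and finalize.
lib.write("prices", full, staged=True)
lib.write("prices", empty, staged=True)
vit = lib.sort_and_finalize_staged_data("prices")
print(vit.version, lib.read("prices").data.shape)
```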
1 change: 1 addition & 0 deletions cpp/arcticdb/util/error_code.hpp
@@ -92,6 +92,7 @@ inline std::unordered_map<ErrorCategory, const char*> get_error_category_names()
ERROR_CODE(7001, E_INVALID_DECIMAL_STRING) \
ERROR_CODE(7002, E_INVALID_CHAR_IN_NAME) \
ERROR_CODE(7003, E_NAME_TOO_LONG) \
ERROR_CODE(7004, E_NO_STAGED_SEGMENTS) \
ERROR_CODE(8000, E_UNRECOGNISED_COLUMN_STATS_VERSION) \
ERROR_CODE(9000, E_DECODE_ERROR) \
ERROR_CODE(9001, E_UNKNOWN_CODEC) \
5 changes: 4 additions & 1 deletion cpp/arcticdb/version/version_core-inl.hpp
@@ -127,12 +127,15 @@ void do_compact(
};

for(auto it = target_start; it != target_end; ++it) {
decltype(auto) sk = [&it](){
auto sk = [&it](){
if constexpr(std::is_same_v<IteratorType, pipelines::PipelineContext::iterator>)
return it->slice_and_key();
else
return *it;
}();
if (sk.slice().rows().diff() == 0) {
continue;
}
aggregator.add_segment(
std::move(sk.segment(store)),
sk.slice(),
47 changes: 38 additions & 9 deletions cpp/arcticdb/version/version_core.cpp
@@ -789,22 +789,32 @@ void read_incompletes_to_pipeline(
if(incomplete_segments.empty())
return;

// In order to have the right normalization metadata and descriptor we need to find the first non-empty segment.
// Picking an empty segment when there are non-empty ones will impact the index type and column names.
// If all segments are empty we will proceed as if we were appending/writing an empty dataframe.
debug::check<ErrorCode::E_ASSERTION_FAILURE>(!incomplete_segments.empty(), "Incomplete segments must be non-empty");
const auto* seg = &incomplete_segments.front().segment(store);
for (auto& slice : incomplete_segments) {
if (slice.segment(store).row_count() > 0) {
seg = &slice.segment_.value();
break;
}
}
// Mark the start point of the incompletes, so we know that there is no column slicing after this point
pipeline_context->incompletes_after_ = pipeline_context->slice_and_keys_.size();

// If there are only incompletes we need to add the index here
if(pipeline_context->slice_and_keys_.empty()) {
add_index_columns_to_query(read_query, incomplete_segments.begin()->segment(store).index_descriptor());
add_index_columns_to_query(read_query, seg->index_descriptor());
}

auto first_seg = incomplete_segments.begin()->segment(store);

if (!pipeline_context->desc_)
pipeline_context->desc_ = first_seg.descriptor();
pipeline_context->desc_ = seg->descriptor();

if (!pipeline_context->norm_meta_) {
pipeline_context->norm_meta_ = std::make_unique<arcticdb::proto::descriptors::NormalizationMetadata>();
auto segment_tsd = first_seg.index_descriptor();
pipeline_context->norm_meta_->CopyFrom(segment_tsd.proto().normalization());
pipeline_context->norm_meta_->CopyFrom(seg->index_descriptor().proto().normalization());
ensure_timeseries_norm_meta(*pipeline_context->norm_meta_, pipeline_context->stream_id_, sparsify);
}

@@ -847,11 +857,16 @@ void check_incompletes_index_ranges_dont_overlap(const std::shared_ptr<PipelineC
// Use ordered set so we only need to compare adjacent elements
std::set<TimestampRange> unique_timestamp_ranges;
for (auto it = pipeline_context->incompletes_begin(); it!= pipeline_context->end(); it++) {
if (it->slice_and_key().slice().rows().diff() == 0) {
continue;
}
sorting::check<ErrorCode::E_UNSORTED_DATA>(
!last_existing_index_value.has_value() || it->slice_and_key().key().start_time() >= *last_existing_index_value,
"Cannot append staged segments to existing data as incomplete segment contains index value < existing data (in UTC): {} <= {}",
date_and_time(it->slice_and_key().key().start_time()),
date_and_time(*last_existing_index_value));
// Should never reach "" but the standard mandates that all function arguments are evaluated
last_existing_index_value ? date_and_time(*last_existing_index_value) : ""
);
auto [_, inserted] = unique_timestamp_ranges.insert({it->slice_and_key().key().start_time(), it->slice_and_key().key().end_time()});
// This is correct because incomplete segments aren't column sliced
sorting::check<ErrorCode::E_UNSORTED_DATA>(
@@ -1278,6 +1293,10 @@ VersionedItem sort_merge_impl(
options.convert_int_to_float_,
options.via_iteration_,
options.sparsify_);
user_input::check<ErrorCode::E_NO_STAGED_SEGMENTS>(
pipeline_context->slice_and_keys_.size() > 0,
"Finalizing staged data is not allowed with empty staging area"
);

std::vector<entity::VariantKey> delete_keys;
for(auto sk = pipeline_context->incompletes_begin(); sk != pipeline_context->end(); ++sk) {
@@ -1299,6 +1318,15 @@

read_query.clauses_.emplace_back(std::make_shared<Clause>(MergeClause{timeseries_index, SparseColumnPolicy{}, stream_id, pipeline_context->descriptor()}));
auto segments = read_and_process(store, pipeline_context, read_query, ReadOptions{}, pipeline_context->incompletes_after());
if (options.append_ && update_info.previous_index_key_.has_value() &&
update_info.previous_index_key_->end_time() - 1 > std::get<timestamp>(TimeseriesIndex::start_value_for_segment(segments[0].segment_.value()))) {
store->remove_keys(delete_keys).get();
sorting::raise<ErrorCode::E_UNSORTED_DATA>(
"Cannot append staged segments to existing data as incomplete segment contains index value < existing data (in UTC): {} <= {}",
date_and_time(std::get<timestamp>(TimeseriesIndex::start_value_for_segment(segments[0].segment_.value()))),
date_and_time(update_info.previous_index_key_->end_time() - 1)
);
}
pipeline_context->total_rows_ = num_versioned_rows + get_slice_rowcounts(segments);

auto index = index_type_from_descriptor(pipeline_context->descriptor());
@@ -1375,9 +1403,10 @@ VersionedItem compact_incomplete_impl(
options.convert_int_to_float_,
options.via_iteration_,
options.sparsify_);
if (pipeline_context->slice_and_keys_.size() == prev_size) {
util::raise_rte("No incomplete segments found for {}", stream_id);
}
user_input::check<ErrorCode::E_NO_STAGED_SEGMENTS>(
pipeline_context->slice_and_keys_.size() != prev_size,
"Finalizing staged data is not allowed with empty staging area"
);
if (options.validate_index_) {
check_incompletes_index_ranges_dont_overlap(pipeline_context, previous_sorted_value);
}
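
Two behavioural consequences of the version_core.cpp changes above are worth illustrating: finalizing with nothing staged now fails fast with E_NO_STAGED_SEGMENTS (a user-input error) rather than a generic runtime error, and sort_merge_impl now rejects an append whose staged index values start before the existing data, removing the staged keys before raising E_UNSORTED_DATA. A hedged Python sketch (URI and symbol names are illustrative; the exact Python exception classes are not shown in this diff, so a broad except is used):

```python
import pandas as pd
from arcticdb import Arctic
from arcticdb.version_store.library import StagedDataFinalizeMethod

ac = Arctic("lmdb://./arctic_demo")  # illustrative URI
lib = ac.get_library("staging_demo", create_if_missing=True)

# 1. Empty staging area: finalizing now raises E_NO_STAGED_SEGMENTS.
try:
    lib.sort_and_finalize_staged_data("nothing_staged")
except Exception as exc:  # exact exception class not shown in this diff
    print("empty staging area:", exc)

# 2. Appending staged rows that start before the existing data now raises
#    E_UNSORTED_DATA from the sort-and-finalize path as well.
lib.write("ordered", pd.DataFrame({"v": [1]}, index=pd.date_range("2024-01-02", periods=1)))
lib.write("ordered", pd.DataFrame({"v": [0]}, index=pd.date_range("2024-01-01", periods=1)),
          staged=True)
try:
    lib.sort_and_finalize_staged_data("ordered", mode=StagedDataFinalizeMethod.APPEND)
except Exception as exc:
    print("out-of-order append:", exc)
```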
5 changes: 3 additions & 2 deletions python/arcticdb/version_store/_store.py
@@ -1951,7 +1951,7 @@ def compact_incomplete(
metadata: Optional[Any] = None,
prune_previous_version: Optional[bool] = None,
validate_index: bool = False,
):
) -> VersionedItem:
"""
Compact previously written un-indexed chunks of data, produced by a tick collector or parallel
writes/appends.
@@ -1988,9 +1988,10 @@
"prune_previous_version", self._write_options(), global_default=False, existing_value=prune_previous_version
)
udm = normalize_metadata(metadata) if metadata is not None else None
return self.version_store.compact_incomplete(
vit = self.version_store.compact_incomplete(
symbol, append, convert_int_to_float, via_iteration, sparsify, udm, prune_previous_version, validate_index
)
return self._convert_thin_cxx_item_to_python(vit, metadata)

@staticmethod
def _get_index_columns_from_descriptor(descriptor):
29 changes: 23 additions & 6 deletions python/arcticdb/version_store/library.py
@@ -25,6 +25,7 @@
import pandas as pd
import numpy as np
import logging
from arcticdb.version_store._normalization import normalize_metadata

logger = logging.getLogger(__name__)

@@ -902,7 +903,7 @@ def finalize_staged_data(
prune_previous_versions: bool = False,
metadata: Any = None,
validate_index=True,
):
) -> VersionedItem:
"""
Finalizes staged data, making it available for reads.
@@ -924,12 +925,18 @@ are non-overlapping with each other, and, in the case of `StagedDataFinalizeMethod.APPEND`, fall after the
are non-overlapping with each other, and, in the case of `StagedDataFinalizeMethod.APPEND`, fall after the
last index value in the previous version.
Returns
-------
VersionedItem
Structure containing metadata and version number of the written symbol in the store.
The data member will be None.
See Also
--------
write
Documentation on the ``staged`` parameter explains the concept of staged data in more detail.
"""
self._nvs.compact_incomplete(
return self._nvs.compact_incomplete(
symbol,
append=mode == StagedDataFinalizeMethod.APPEND,
convert_int_to_float=False,
@@ -943,7 +950,8 @@ def sort_and_finalize_staged_data(
symbol: str,
mode: Optional[StagedDataFinalizeMethod] = StagedDataFinalizeMethod.WRITE,
prune_previous_versions: bool = False,
):
metadata: Any = None
) -> VersionedItem:
"""
sort_merge will sort and finalize staged data. This differs from `finalize_staged_data` in that it
can support staged segments with interleaved time periods - the end result will be ordered. This requires
@@ -961,18 +969,27 @@
prune_previous_versions : bool, default=False
Removes previous (non-snapshotted) versions from the database.
metadata : Any, default=None
Optional metadata to persist along with the symbol.
Returns
-------
VersionedItem
Structure containing metadata and version number of the written symbol in the store.
The data member will be None.
See Also
--------
write
Documentation on the ``staged`` parameter explains the concept of staged data in more detail.
"""

self._nvs.version_store.sort_merge(
vit = self._nvs.version_store.sort_merge(
symbol,
None,
normalize_metadata(metadata) if metadata is not None else None,
mode == StagedDataFinalizeMethod.APPEND,
prune_previous_versions=prune_previous_versions,
)
return self._nvs._convert_thin_cxx_item_to_python(vit, metadata)

def get_staged_symbols(self) -> List[str]:
"""
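
On the Python side, finalize_staged_data and sort_and_finalize_staged_data now return the VersionedItem produced by the compaction, and sort_and_finalize_staged_data additionally accepts metadata that is normalized and persisted with the new version. A short usage sketch under the same illustrative setup as above:

```python
import pandas as pd
from arcticdb import Arctic

ac = Arctic("lmdb://./arctic_demo")  # illustrative URI
lib = ac.get_library("staging_demo", create_if_missing=True)

df = pd.DataFrame({"v": [1, 2, 3]}, index=pd.date_range("2024-01-01", periods=3))
lib.write("metered", df, staged=True)

# The call now returns a VersionedItem (its data member is None) and persists metadata.
vit = lib.sort_and_finalize_staged_data("metered", metadata={"batch": "2024-01-01"})
print(vit.symbol, vit.version, vit.metadata)
print(lib.read_metadata("metered").metadata)
```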