From e8e5faeab39d35bae3745e036c9d1d75143b2bbf Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Wed, 10 Jan 2024 13:05:14 +0200 Subject: [PATCH 01/30] Add src and dest type descriptors to decode or expand. Add backfilling. --- cpp/arcticdb/entity/types.hpp | 15 +++- cpp/arcticdb/pipeline/read_frame.cpp | 78 +++++++++++++++---- cpp/arcticdb/python/python_handlers.cpp | 46 +++++++---- cpp/arcticdb/python/python_handlers.hpp | 12 ++- cpp/arcticdb/util/type_handler.hpp | 18 ++++- .../version_store/test_empty_column_type.py | 26 +++++-- 6 files changed, 150 insertions(+), 45 deletions(-) diff --git a/cpp/arcticdb/entity/types.hpp b/cpp/arcticdb/entity/types.hpp index 876bd69947..1e4b98705a 100644 --- a/cpp/arcticdb/entity/types.hpp +++ b/cpp/arcticdb/entity/types.hpp @@ -508,6 +508,10 @@ struct TypeDescriptor { data_type_ = combine_data_type(slice_value_type(data_type_), new_size_bits); } + [[nodiscard]] SizeBits get_size_bits() const { + return slice_bit_size(data_type_); + } + [[nodiscard]] constexpr int get_type_byte_size() const { return get_byte_count(slice_bit_size(data_type_)); } @@ -529,11 +533,16 @@ constexpr bool is_numpy_array(TypeDescriptor td) { } constexpr bool is_pyobject_type(TypeDescriptor td) { - return is_dynamic_string_type(slice_value_type(td.data_type())) || - slice_value_type(td.data_type()) == ValueType::PYBOOL || - is_numpy_array(td); + return is_dynamic_string_type(slice_value_type(td.data_type())) || is_py_bool_type(td.data_type()) || + is_numpy_array(td); } +constexpr inline bool is_nullable_type(TypeDescriptor td) { + return td.dimension() > Dimension::Dim0 || + is_empty_type(td.data_type()) || + is_sequence_type(td.data_type()) || + is_py_bool_type(td.data_type()); +} inline void set_data_type(DataType data_type, TypeDescriptor& type_desc) { type_desc.data_type_ = data_type; diff --git a/cpp/arcticdb/pipeline/read_frame.cpp b/cpp/arcticdb/pipeline/read_frame.cpp index 7077941d0c..5432de6215 100644 --- a/cpp/arcticdb/pipeline/read_frame.cpp +++ b/cpp/arcticdb/pipeline/read_frame.cpp @@ -225,12 +225,22 @@ void decode_or_expand_impl( const uint8_t*& data, uint8_t* dest, const EncodedFieldType& encoded_field_info, - const TypeDescriptor& type_descriptor, + const TypeDescriptor& source_type_descriptor, + const TypeDescriptor& destination_type_descriptor, size_t dest_bytes, std::shared_ptr buffers, EncodingVersion encding_version) { - if(auto handler = TypeHandlerRegistry::instance()->get_handler(type_descriptor); handler) { - handler->handle_type(data, dest, VariantField{&encoded_field_info}, type_descriptor, dest_bytes, std::move(buffers), encding_version); + if(auto handler = TypeHandlerRegistry::instance()->get_handler(source_type_descriptor); handler) { + handler->handle_type( + data, + dest, + VariantField{&encoded_field_info}, + source_type_descriptor, + destination_type_descriptor, + dest_bytes, + std::move(buffers), + encding_version + ); } else { std::optional bv; if (encoded_field_info.has_ndarray() && encoded_field_info.ndarray().sparse_map_bytes() > 0) { @@ -238,8 +248,8 @@ void decode_or_expand_impl( const auto bytes = encoding_sizes::data_uncompressed_size(ndarray); ChunkedBuffer sparse{bytes}; SliceDataSink sparse_sink{sparse.data(), bytes}; - data += decode_field(type_descriptor, encoded_field_info, data, sparse_sink, bv, encding_version); - type_descriptor.visit_tag([dest, dest_bytes, &bv, &sparse](const auto tdt) { + data += decode_field(source_type_descriptor, encoded_field_info, data, sparse_sink, bv, encding_version); + 
source_type_descriptor.visit_tag([dest, dest_bytes, &bv, &sparse](const auto tdt) { using TagType = decltype(tdt); using RawType = typename TagType::DataTypeTag::raw_type; util::default_initialize(dest, dest_bytes); @@ -249,12 +259,12 @@ void decode_or_expand_impl( SliceDataSink sink(dest, dest_bytes); const auto &ndarray = encoded_field_info.ndarray(); if (const auto bytes = encoding_sizes::data_uncompressed_size(ndarray); bytes < dest_bytes) { - type_descriptor.visit_tag([dest, bytes, dest_bytes](const auto tdt) { + source_type_descriptor.visit_tag([dest, bytes, dest_bytes](const auto tdt) { using TagType = decltype(tdt); util::default_initialize(dest + bytes, dest_bytes - bytes); }); } - data += decode_field(type_descriptor, encoded_field_info, data, sink, bv, encding_version); + data += decode_field(source_type_descriptor, encoded_field_info, data, sink, bv, encding_version); } } } @@ -279,13 +289,23 @@ void decode_or_expand( const uint8_t*& data, uint8_t* dest, const VariantField& variant_field, - const TypeDescriptor& type_descriptor, + const TypeDescriptor& source_type_descriptor, + const TypeDescriptor& destination_type_descriptor, size_t dest_bytes, std::shared_ptr buffers, EncodingVersion encoding_version ) { util::variant_match(variant_field, [&](auto field) { - decode_or_expand_impl(data, dest, *field, type_descriptor, dest_bytes, buffers, encoding_version); + decode_or_expand_impl( + data, + dest, + *field, + source_type_descriptor, + destination_type_descriptor, + dest_bytes, + buffers, + encoding_version + ); }); } @@ -369,7 +389,9 @@ void decode_into_frame_static( auto field_name = context.descriptor().fields(it.source_field_pos()).name(); auto& buffer = frame.column(static_cast(it.dest_col())).data().buffer(); ColumnMapping m{frame, it.dest_col(), it.source_field_pos(), context}; - util::check(trivially_compatible_types(m.source_type_desc_, m.dest_type_desc_), "Column type conversion from {} to {} not implemented in column {}:{} -> {}:{}", + const bool types_trivially_compatible = trivially_compatible_types(m.source_type_desc_, m.dest_type_desc_); + const bool any_type_is_empty = is_empty_type(m.source_type_desc_.data_type()) || is_empty_type(m.dest_type_desc_.data_type()); + util::check(types_trivially_compatible || any_type_is_empty, "Column type conversion from {} to {} not implemented in column {}:{} -> {}:{}", m.source_type_desc_, m.dest_type_desc_, it.source_col(), @@ -377,7 +399,16 @@ void decode_into_frame_static( it.dest_col(), m.frame_field_descriptor_.name()); util::check(data != end || remaining_fields_empty(it, context), "Reached end of input block with {} fields to decode", it.remaining_fields()); - decode_or_expand(data, buffer.data() + m.offset_bytes_, encoded_field, m.source_type_desc_, m.dest_bytes_, buffers, encoding_version); + decode_or_expand( + data, + buffer.data() + m.offset_bytes_, + encoded_field, + m.source_type_desc_, + m.dest_type_desc_, + m.dest_bytes_, + buffers, + encoding_version + ); ARCTICDB_TRACE(log::codec(), "Decoded column {} to position {}", field_name, data - begin); it.advance(); @@ -449,14 +480,22 @@ void decode_into_frame_dynamic( if constexpr(std::is_arithmetic_v && std::is_arithmetic_v) { const auto src_bytes = sizeof_datatype(m.source_type_desc_) * m.num_rows_; Buffer tmp_buf{src_bytes}; - decode_or_expand(data, tmp_buf.data(), encoded_field, m.source_type_desc_, src_bytes, buffers, encdoing_version); + decode_or_expand( + data, + tmp_buf.data(), + encoded_field, + m.source_type_desc_, + m.dest_type_desc_, + src_bytes, + 
buffers, + encdoing_version + ); auto src_ptr = reinterpret_cast(tmp_buf.data()); auto dest_ptr = reinterpret_cast(buffer.data() + m.offset_bytes_); for (auto i = 0u; i < m.num_rows_; ++i) { *dest_ptr++ = static_cast(*src_ptr++); } - } - else { + } else { util::raise_rte("Can't promote type {} to type {} in field {}", m.source_type_desc_, m.dest_type_desc_, m.frame_field_descriptor_.name()); } }); @@ -469,7 +508,16 @@ void decode_into_frame_dynamic( "Reached end of input block with {} fields to decode", field_count - field_col); - decode_or_expand(data, buffer.data() + m.offset_bytes_, encoded_field, m.source_type_desc_, m.dest_bytes_, buffers, encdoing_version); + decode_or_expand( + data, + buffer.data() + m.offset_bytes_, + encoded_field, + m.source_type_desc_, + m.dest_type_desc_, + m.dest_bytes_, + buffers, + encdoing_version + ); } ARCTICDB_TRACE(log::codec(), "Decoded column {} to position {}", frame.field(dst_col).name(), data - begin); } diff --git a/cpp/arcticdb/python/python_handlers.cpp b/cpp/arcticdb/python/python_handlers.cpp index cadf3e58ef..c0e3abbbd0 100644 --- a/cpp/arcticdb/python/python_handlers.cpp +++ b/cpp/arcticdb/python/python_handlers.cpp @@ -26,8 +26,8 @@ namespace arcticdb { } /// @important This calls pybind's initialize array function which is NOT thread safe. Moreover, numpy arrays can - /// be created only by the thread holindg the GIL. In practice we can get away with allocating arrays only from - /// a sigle thread (enve if it's not the one holding the GIL). This, however, is not guranteed to work. + /// be created only by the thread holding the GIL. In practice we can get away with allocating arrays only from + /// a single thread (even if it's not the one holding the GIL). This, however, is not guaranteed to work. 
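
The constraint described above can be made concrete with a short sketch (an editorial example, not part of the patch; `descr`, `shape` and `strides` stand in for the values initialize_array works with, and gil_scoped_acquire is pybind11's RAII guard for the GIL):

    pybind11::gil_scoped_acquire acquire;        // take the GIL before touching Python state
    pybind11::array arr{descr, shape, strides};  // safe: the array is created while the GIL is held
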
/// @todo Allocate numpy arrays only from the thread holding the GIL [[nodiscard]] static inline PyObject* initialize_array( const pybind11::dtype& descr, @@ -57,7 +57,8 @@ namespace arcticdb { const uint8_t*& input, uint8_t* dest, const VariantField& variant_field, - const entity::TypeDescriptor&, + const entity::TypeDescriptor& source_type_descriptor, + const entity::TypeDescriptor& destination_type_descriptor, size_t dest_bytes, std::shared_ptr, EncodingVersion @@ -66,8 +67,25 @@ namespace arcticdb { util::check(dest != nullptr, "Got null destination pointer"); const size_t num_rows = dest_bytes / get_type_size(DataType::EMPTYVAL); static_assert(get_type_size(DataType::EMPTYVAL) == sizeof(PyObject *)); - auto ptr_dest = reinterpret_cast(dest); - fill_with_none(ptr_dest, num_rows); + + const SizeBits destination_size_bits = destination_type_descriptor.get_size_bits(); + destination_type_descriptor.visit_tag([&](auto tag) { + if constexpr (is_nullable_type(TypeDescriptor(tag))) { + auto ptr_dest = reinterpret_cast(dest); + fill_with_none(ptr_dest, num_rows); + } else if constexpr (is_floating_point_type(tag.data_type())) { + using RawType = typename decltype(tag)::DataTypeTag::raw_type; + RawType* ptr_dest = reinterpret_cast(dest); + std::generate(ptr_dest, ptr_dest + num_rows, &std::numeric_limits::quiet_NaN); + dest += num_rows * sizeof(RawType); + } else if constexpr (is_integer_type(tag.data_type()) || is_bool_type(tag.data_type()) || is_time_type(tag.data_type())) { + using RawType = typename decltype(tag)::DataTypeTag::raw_type; + std::memset(dest, 0, sizeof(RawType) * num_rows); + dest += num_rows * sizeof(RawType); + } else { + static_assert(sizeof(tag) == 0, "Unhandled data_type"); + } + }); util::variant_match(variant_field, [&input](const auto &field) { using EncodedFieldType = std::decay_t; @@ -92,7 +110,8 @@ namespace arcticdb { const uint8_t*& data, uint8_t* dest, const VariantField& encoded_field_info, - const entity::TypeDescriptor& type_descriptor, + const entity::TypeDescriptor& source_type_descriptor, + const entity::TypeDescriptor& destination_type_descriptor, size_t dest_bytes, std::shared_ptr, EncodingVersion encding_version @@ -116,7 +135,7 @@ namespace arcticdb { const auto bytes = encoding_sizes::data_uncompressed_size(ndarray); auto sparse = ChunkedBuffer::presized(bytes); SliceDataSink sparse_sink{sparse.data(), bytes}; - data += decode_field(type_descriptor, *field, data, sparse_sink, bv, encding_version); + data += decode_field(source_type_descriptor, *field, data, sparse_sink, bv, encding_version); const auto num_bools = bv->count(); auto ptr_src = sparse.template ptr_cast(0, num_bools * sizeof(uint8_t)); auto last_row = 0u; @@ -142,7 +161,8 @@ namespace arcticdb { const uint8_t*& data, uint8_t* dest, const VariantField& encoded_field_info, - const entity::TypeDescriptor& type_descriptor, + const entity::TypeDescriptor& source_type_descriptor, + const entity::TypeDescriptor& destination_type_descriptor, size_t dest_bytes, std::shared_ptr buffers, EncodingVersion encoding_version @@ -158,17 +178,17 @@ namespace arcticdb { fill_with_none(ptr_dest, row_count); return; } - std::shared_ptr column = buffers->get_buffer(type_descriptor, true); + std::shared_ptr column = buffers->get_buffer(source_type_descriptor, true); column->check_magic(); log::version().info("Column got buffer at {}", uintptr_t(column.get())); auto bv = std::make_optional(util::BitSet{}); - data += decode_field(type_descriptor, *field, data, *column, bv, encoding_version); + data += 
decode_field(source_type_descriptor, *field, data, *column, bv, encoding_version); auto last_row = 0u; ARCTICDB_SUBSAMPLE(InitArrayAcquireGIL, 0) - const auto strides = static_cast(get_type_size(type_descriptor.data_type())); - const py::dtype py_dtype = generate_python_dtype(type_descriptor, strides); - type_descriptor.visit_tag([&] (auto tdt) { + const auto strides = static_cast(get_type_size(source_type_descriptor.data_type())); + const py::dtype py_dtype = generate_python_dtype(source_type_descriptor, strides); + source_type_descriptor.visit_tag([&] (auto tdt) { const auto& blocks = column->blocks(); if(blocks.empty()) return; diff --git a/cpp/arcticdb/python/python_handlers.hpp b/cpp/arcticdb/python/python_handlers.hpp index f6f750791b..4c501bee5c 100644 --- a/cpp/arcticdb/python/python_handlers.hpp +++ b/cpp/arcticdb/python/python_handlers.hpp @@ -17,7 +17,8 @@ namespace arcticdb { const uint8_t*& data, uint8_t* dest, const VariantField& encoded_field, - const entity::TypeDescriptor& type_descriptor, + const entity::TypeDescriptor& source_type_descriptor, + const entity::TypeDescriptor& destination_type_descriptor, size_t dest_bytes, std::shared_ptr buffers, EncodingVersion encding_version); @@ -29,7 +30,8 @@ namespace arcticdb { const uint8_t *&data, uint8_t *dest, const VariantField &encoded_field, - const entity::TypeDescriptor &type_descriptor, + const entity::TypeDescriptor& source_type_descriptor, + const entity::TypeDescriptor& destination_type_descriptor, size_t dest_bytes, std::shared_ptr buffers, EncodingVersion encding_version); @@ -40,7 +42,8 @@ namespace arcticdb { const uint8_t*& data, uint8_t* dest, const VariantField& encoded_field, - const entity::TypeDescriptor& type_descriptor, + const entity::TypeDescriptor& source_type_descriptor, + const entity::TypeDescriptor& destination_type_descriptor, size_t dest_bytes, std::shared_ptr buffers ); @@ -55,7 +58,8 @@ namespace arcticdb { const uint8_t*& data, uint8_t* dest, const VariantField& encoded_field, - const entity::TypeDescriptor& type_descriptor, + const entity::TypeDescriptor& source_type_descriptor, + const entity::TypeDescriptor& destination_type_descriptor, size_t dest_bytes, std::shared_ptr buffers, EncodingVersion encding_version); diff --git a/cpp/arcticdb/util/type_handler.hpp b/cpp/arcticdb/util/type_handler.hpp index a696b0c9ec..aa21f0e14f 100644 --- a/cpp/arcticdb/util/type_handler.hpp +++ b/cpp/arcticdb/util/type_handler.hpp @@ -32,11 +32,23 @@ struct ITypeHandler { const uint8_t*& source, uint8_t* dest, const VariantField& encoded_field_info, - const entity::TypeDescriptor& type_descriptor, + const entity::TypeDescriptor& source_type_descriptor, + const entity::TypeDescriptor& destination_type_descriptor, size_t dest_bytes, std::shared_ptr buffers, - EncodingVersion encoding_version) { - folly::poly_call<0>(*this, source, dest, encoded_field_info, type_descriptor, dest_bytes, buffers, encoding_version); + EncodingVersion encoding_version + ) { + folly::poly_call<0>( + *this, + source, + dest, + encoded_field_info, + source_type_descriptor, + destination_type_descriptor, + dest_bytes, + buffers, + encoding_version + ); } }; diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index 1dc64f9881..7f706da764 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -10,21 +10,32 @@ import numpy as np import pytest 
-@pytest.fixture(params=(int, float, str))
-def element_type_constructor(request):
-    yield request.param
+@pytest.fixture
+def int_dtype():
+    bit_sizes = ["8", "16", "32", "64"]
+    base_numeric_types = ["int", "uint"]
+    for t in base_numeric_types:
+        for s in bit_sizes:
+            yield t + s
 
 
-@pytest.mark.xfail(reason="Not implemented")
+@pytest.fixture
+def float_dtype():
+    bit_sizes = ["32", "64"]
+    for s in bit_sizes:
+        yield "float" + s
+
 class TestEmptyColumnTypeAppend:
-    def test_can_append_non_empty_to_empty(self, lmdb_version_store, element_type_constructor):
+    def test_can_append_non_empty_to_empty(self, lmdb_version_store_v2, int_dtype):
+        lmdb_version_store = lmdb_version_store_v2
         df_empty = pd.DataFrame({"col1": [None, None]})
         lmdb_version_store.write("test_can_append_to_empty_column", df_empty)
-        df_non_empty = pd.DataFrame({"col1": map(element_type_constructor, [1, 2, 3])})
+        df_non_empty = pd.DataFrame({"col1": np.array([1,2,3], dtype=int_dtype)})
         lmdb_version_store.append("test_can_append_to_empty_column", df_non_empty)
-        expected_result = pd.concat([df_empty, df_non_empty], ignore_index=True)
+        expected_result = pd.DataFrame({"col1": np.array([0,0,1,2,3], dtype=int_dtype)})
         assert_frame_equal(lmdb_version_store.read("test_can_append_to_empty_column").data, expected_result)
 
+    @pytest.mark.xfail(reason="Not implemented")
     def test_can_append_empty_to_non_empty(self, lmdb_version_store, element_type_constructor):
         df_empty = pd.DataFrame({"col1": [None, None]})
         df_non_empty = pd.DataFrame({"col1": map(element_type_constructor, [1, 2, 3])})
@@ -33,6 +44,7 @@ def test_can_append_empty_to_non_empty(self, lmdb_version_store, element_type_co
         expected_result = pd.concat([df_non_empty, df_empty], ignore_index=True)
         assert_frame_equal(lmdb_version_store.read("test_can_append_to_empty_column").data, expected_result)
 
+    @pytest.mark.xfail(reason="Not implemented")
     def test_can_append_empty_to_empty(self, lmdb_version_store, element_type_constructor):
         df_empty_1 = pd.DataFrame({"col1": [None, None]})
         df_empty_2 = pd.DataFrame({"col1": []})

From 0ab4c22896842c8806cfcd652b31b279f8d88093 Mon Sep 17 00:00:00 2001
From: Vasil Pashov
Date: Thu, 11 Jan 2024 14:23:50 +0200
Subject: [PATCH 02/30] Fix string backfills in empty type.
Add test cases for all types to check if they can be appened to null --- cpp/arcticdb/entity/types.hpp | 7 -- cpp/arcticdb/pipeline/read_frame.cpp | 81 ++++++++--------- cpp/arcticdb/python/python_handlers.cpp | 42 +++++---- cpp/arcticdb/util/sparse_utils.hpp | 3 + python/test.py | 7 ++ .../version_store/test_empty_column_type.py | 91 +++++++++++-------- 6 files changed, 130 insertions(+), 101 deletions(-) create mode 100644 python/test.py diff --git a/cpp/arcticdb/entity/types.hpp b/cpp/arcticdb/entity/types.hpp index 1e4b98705a..771a03d083 100644 --- a/cpp/arcticdb/entity/types.hpp +++ b/cpp/arcticdb/entity/types.hpp @@ -537,13 +537,6 @@ constexpr bool is_pyobject_type(TypeDescriptor td) { is_numpy_array(td); } -constexpr inline bool is_nullable_type(TypeDescriptor td) { - return td.dimension() > Dimension::Dim0 || - is_empty_type(td.data_type()) || - is_sequence_type(td.data_type()) || - is_py_bool_type(td.data_type()); -} - inline void set_data_type(DataType data_type, TypeDescriptor& type_desc) { type_desc.data_type_ = data_type; } diff --git a/cpp/arcticdb/pipeline/read_frame.cpp b/cpp/arcticdb/pipeline/read_frame.cpp index 5432de6215..e8f2e90061 100644 --- a/cpp/arcticdb/pipeline/read_frame.cpp +++ b/cpp/arcticdb/pipeline/read_frame.cpp @@ -296,16 +296,16 @@ void decode_or_expand( EncodingVersion encoding_version ) { util::variant_match(variant_field, [&](auto field) { - decode_or_expand_impl( - data, - dest, - *field, - source_type_descriptor, - destination_type_descriptor, - dest_bytes, - buffers, - encoding_version - ); + decode_or_expand_impl( + data, + dest, + *field, + source_type_descriptor, + destination_type_descriptor, + dest_bytes, + buffers, + encoding_version + ); }); } @@ -399,16 +399,16 @@ void decode_into_frame_static( it.dest_col(), m.frame_field_descriptor_.name()); util::check(data != end || remaining_fields_empty(it, context), "Reached end of input block with {} fields to decode", it.remaining_fields()); - decode_or_expand( - data, - buffer.data() + m.offset_bytes_, - encoded_field, - m.source_type_desc_, - m.dest_type_desc_, - m.dest_bytes_, - buffers, - encoding_version - ); + decode_or_expand( + data, + buffer.data() + m.offset_bytes_, + encoded_field, + m.source_type_desc_, + m.dest_type_desc_, + m.dest_bytes_, + buffers, + encoding_version + ); ARCTICDB_TRACE(log::codec(), "Decoded column {} to position {}", field_name, data - begin); it.advance(); @@ -477,19 +477,19 @@ void decode_into_frame_dynamic( using DestinationType = typename decltype(dest_desc_tag)::DataTypeTag::raw_type; m.source_type_desc_.visit_tag([&buffer, &m, &data, &encoded_field, &buffers, encdoing_version] (auto src_desc_tag ) { using SourceType = typename decltype(src_desc_tag)::DataTypeTag::raw_type; - if constexpr(std::is_arithmetic_v && std::is_arithmetic_v) { + if constexpr (std::is_arithmetic_v && std::is_arithmetic_v) { const auto src_bytes = sizeof_datatype(m.source_type_desc_) * m.num_rows_; Buffer tmp_buf{src_bytes}; - decode_or_expand( - data, - tmp_buf.data(), - encoded_field, - m.source_type_desc_, - m.dest_type_desc_, - src_bytes, - buffers, - encdoing_version - ); + decode_or_expand( + data, + tmp_buf.data(), + encoded_field, + m.source_type_desc_, + m.dest_type_desc_, + src_bytes, + buffers, + encdoing_version + ); auto src_ptr = reinterpret_cast(tmp_buf.data()); auto dest_ptr = reinterpret_cast(buffer.data() + m.offset_bytes_); for (auto i = 0u; i < m.num_rows_; ++i) { @@ -507,17 +507,16 @@ void decode_into_frame_dynamic( util::check(data != end, "Reached end of input block 
with {} fields to decode",
            field_count - field_col);
-        decode_or_expand(
-                data,
-                buffer.data() + m.offset_bytes_,
-                encoded_field,
-                m.source_type_desc_,
-                m.dest_type_desc_,
-                m.dest_bytes_,
-                buffers,
-                encdoing_version
-        );
+        decode_or_expand(
+            data,
+            buffer.data() + m.offset_bytes_,
+            encoded_field,
+            m.source_type_desc_,
+            m.dest_type_desc_,
+            m.dest_bytes_,
+            buffers,
+            encdoing_version
+        );
     }
     ARCTICDB_TRACE(log::codec(), "Decoded column {} to position {}", frame.field(dst_col).name(), data - begin);
 }
diff --git a/cpp/arcticdb/python/python_handlers.cpp b/cpp/arcticdb/python/python_handlers.cpp
index c0e3abbbd0..b549083444 100644
--- a/cpp/arcticdb/python/python_handlers.cpp
+++ b/cpp/arcticdb/python/python_handlers.cpp
@@ -53,6 +53,10 @@ namespace arcticdb {
         return dest + count;
     }
 
+    constexpr inline bool is_nullable_type(TypeDescriptor td) {
+        return td.dimension() > Dimension::Dim0 || is_empty_type(td.data_type()) || is_py_bool_type(td.data_type());
+    }
+
     void EmptyHandler::handle_type(
         const uint8_t*& input,
         uint8_t* dest,
@@ -65,39 +69,45 @@ namespace arcticdb {
     ) {
         ARCTICDB_SAMPLE(HandleEmpty, 0)
         util::check(dest != nullptr, "Got null destination pointer");
-        const size_t num_rows = dest_bytes / get_type_size(DataType::EMPTYVAL);
-        static_assert(get_type_size(DataType::EMPTYVAL) == sizeof(PyObject *));
+        const size_t num_rows = dest_bytes / destination_type_descriptor.get_type_byte_size();
+        static_assert(get_type_size(DataType::EMPTYVAL) == sizeof(PyObject*));
 
-        const SizeBits destination_size_bits = destination_type_descriptor.get_size_bits();
+        // TODO: arcticdb::util::default_initialize does mostly the same thing; the main
+        // difference is that it does not write py::none directly.
+        // * Should we use it here instead?
+        // * It doesn't handle nullable bools and arrays. How should they be handled?
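
As a condensed reference for the dispatch that follows, the backfill value written for each destination type is (this only summarises the code below, it adds no new behaviour):

    // string (sequence) columns      -> not_a_string() offsets, turned into Python strings later
    // nullable (empty/array/py-bool) -> Py_None via fill_with_none()
    // floating point                 -> std::numeric_limits<RawType>::quiet_NaN()
    // integer and plain bool         -> zero-filled bytes
    // time (datetime64)              -> NaT
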
         destination_type_descriptor.visit_tag([&](auto tag) {
-            if constexpr (is_nullable_type(TypeDescriptor(tag))) {
-                auto ptr_dest = reinterpret_cast<PyObject**>(dest);
-                fill_with_none(ptr_dest, num_rows);
+            using RawType = typename decltype(tag)::DataTypeTag::raw_type;
+            RawType* ptr_dest = reinterpret_cast<RawType*>(dest);
+            if constexpr (is_sequence_type(tag.data_type())) {
+                // not_a_string() is written here because strings are stored as offsets into
+                // the string pool; later, in fix_and_reduce(), the offsets are replaced by
+                // real Python strings.
+                // TODO: It would be best if the type handler sets this to py::none
+                std::fill_n(ptr_dest, num_rows, arcticdb::not_a_string());
+            } else if constexpr (is_nullable_type(TypeDescriptor(tag))) {
+                fill_with_none(reinterpret_cast<PyObject**>(dest), num_rows);
             } else if constexpr (is_floating_point_type(tag.data_type())) {
-                using RawType = typename decltype(tag)::DataTypeTag::raw_type;
-                RawType* ptr_dest = reinterpret_cast<RawType*>(dest);
-                std::generate(ptr_dest, ptr_dest + num_rows, &std::numeric_limits<RawType>::quiet_NaN);
-                dest += num_rows * sizeof(RawType);
-            } else if constexpr (is_integer_type(tag.data_type()) || is_bool_type(tag.data_type()) || is_time_type(tag.data_type())) {
-                using RawType = typename decltype(tag)::DataTypeTag::raw_type;
+                std::fill_n(ptr_dest, num_rows, std::numeric_limits<RawType>::quiet_NaN());
+            } else if constexpr (is_integer_type(tag.data_type()) || is_bool_type(tag.data_type())) {
                 std::memset(dest, 0, sizeof(RawType) * num_rows);
-                dest += num_rows * sizeof(RawType);
+            } else if constexpr (is_time_type(tag.data_type())) {
+                std::fill_n(ptr_dest, num_rows, arcticdb::util::NaT);
             } else {
                 static_assert(sizeof(tag) == 0, "Unhandled data_type");
             }
         });
 
-        util::variant_match(variant_field, [&input](const auto &field) {
+        util::variant_match(variant_field, [&input](const auto& field) {
             using EncodedFieldType = std::decay_t;
             if constexpr (std::is_same_v)
                 util::check_magic(input);
 
             if (field->encoding_case() == EncodedFieldType::kNdarray) {
-                const auto &ndarray_field = field->ndarray();
+                const auto& ndarray_field = field->ndarray();
                 const auto num_blocks = ndarray_field.values_size();
                 util::check(num_blocks <= 1, "Unexpected number of empty type blocks: {}", num_blocks);
                 for (auto block_num = 0; block_num < num_blocks; ++block_num) {
-                    const auto &block_info = ndarray_field.values(block_num);
+                    const auto& block_info = ndarray_field.values(block_num);
                     input += block_info.out_bytes();
                 }
             } else {
diff --git a/cpp/arcticdb/util/sparse_utils.hpp b/cpp/arcticdb/util/sparse_utils.hpp
index e24c6cad59..6eb892c1b3 100644
--- a/cpp/arcticdb/util/sparse_utils.hpp
+++ b/cpp/arcticdb/util/sparse_utils.hpp
@@ -68,6 +68,9 @@ inline void expand_dense_buffer_using_bitmap(const util::BitMagic &bv, const uin
     }
 }
 
+// TODO: EmptyHandler::handle_type has a similar procedure.
+// * Should this be used instead?
+// * What about nullable booleans and arrays?
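
A usage sketch for the default_initialize helper shown below (hypothetical caller code, not part of the patch; the tag spelling follows the TypeDescriptorTag idiom used in types.hpp, and the zero-fill behaviour for an integer tag is inferred from its callers in this patch):

    using Int64Tag = TypeDescriptorTag<DataTypeTag<DataType::INT64>, DimensionTag<Dimension::Dim0>>;
    std::vector<uint8_t> buf(100 * sizeof(int64_t));              // room for 100 rows
    util::default_initialize<Int64Tag>(buf.data(), buf.size());   // integer tag: zero-fills the buffer
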
template void default_initialize(uint8_t* data, size_t bytes) { using RawType = typename TagType::DataTypeTag::raw_type; diff --git a/python/test.py b/python/test.py new file mode 100644 index 0000000000..29127d28a3 --- /dev/null +++ b/python/test.py @@ -0,0 +1,7 @@ +from arcticdb import Arctic +import pandas as pd +import numpy as np +ac = Arctic("lmdb://test") +l = ac.get_library("test", create_if_missing=True) +df = pd.DataFrame({"col": [1,2,3]}) +l.write("test", df) \ No newline at end of file diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index 7f706da764..20e7969b53 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -5,52 +5,69 @@ As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. """ +from math import nan import pandas as pd from pandas.testing import assert_frame_equal import numpy as np import pytest -@pytest.fixture -def int_dtype(): - bit_sizes = ["8", "16", "32", "64"] - base_numeric_types = ["int", "uint"] - for t in base_numeric_types: - for s in bit_sizes: - yield t + s +@pytest.fixture(params=[t + s for s in ["8", "16", "32", "64"] for t in ["int", "uint"]]) +def int_dtype(request): + yield request.param -@pytest.fixture -def float_dtype(): - bit_sizes = ["32", "64"] - for s in bit_sizes: - yield "float" + s +@pytest.fixture(params=["float" + s for s in ["32", "64"]]) +def float_dtype(request): + yield request.param -class TestEmptyColumnTypeAppend: - def test_can_append_non_empty_to_empty(self, lmdb_version_store_v2, int_dtype): - lmdb_version_store = lmdb_version_store_v2 + +# object is for nullable boolean +@pytest.fixture(params=["bool", "object"]) +def boolean_dtype(request): + yield request.param + + +class TestCanAppendToEmptyColumn: + + @pytest.fixture(autouse=True) + def create_empty_column(self, lmdb_version_store): df_empty = pd.DataFrame({"col1": [None, None]}) - lmdb_version_store.write("test_can_append_to_empty_column", df_empty) + lmdb_version_store.write("sym", df_empty) + yield + + def test_integer(self, lmdb_version_store, int_dtype): df_non_empty = pd.DataFrame({"col1": np.array([1,2,3], dtype=int_dtype)}) - lmdb_version_store.append("test_can_append_to_empty_column", df_non_empty) + lmdb_version_store.append("sym", df_non_empty) expected_result = pd.DataFrame({"col1": np.array([0,0,1,2,3], dtype=int_dtype)}) - assert_frame_equal(lmdb_version_store.read("test_can_append_to_empty_column").data, expected_result) + assert_frame_equal(lmdb_version_store.read("sym").data, expected_result) + + def test_float(self, lmdb_version_store, float_dtype): + df_non_empty = pd.DataFrame({"col1": np.array([1,2,3], dtype=float_dtype)}) + lmdb_version_store.append("sym", df_non_empty) + expected_result = pd.DataFrame({"col1": np.array([float("NaN"),float("NaN"),1,2,3], dtype=float_dtype)}) + assert_frame_equal(lmdb_version_store.read("sym").data, expected_result) + + def test_bool(self, lmdb_version_store, boolean_dtype): + df_non_empty = pd.DataFrame({"col1": np.array([True, False, True], dtype=boolean_dtype)}) + lmdb_version_store.append("sym", df_non_empty) + expected_result = pd.DataFrame({"col1": np.array([None, None, True, False, True], dtype=boolean_dtype)}) + assert_frame_equal(lmdb_version_store.read("sym").data, expected_result) + + def 
test_empty(self, lmdb_version_store): + df_non_empty = pd.DataFrame({"col1": np.array([None, None, None])}) + lmdb_version_store.append("sym", df_non_empty) + expected_result = pd.DataFrame({"col1": np.array([None, None, None, None, None])}) + assert_frame_equal(lmdb_version_store.read("sym").data, expected_result) + + def test_string(self, lmdb_version_store): + df_non_empty = pd.DataFrame({"col1": np.array(["some_string", "long_string"*100])}) + lmdb_version_store.append("sym", df_non_empty) + expected_result = pd.DataFrame({"col1": np.array([None, None, "some_string", "long_string"*100])}) + assert_frame_equal(lmdb_version_store.read("sym").data, expected_result) + + def test_date(self, lmdb_version_store): + df_non_empty = pd.DataFrame({"col1": np.array([np.datetime64('2005-02'), np.datetime64('2005-03'), np.datetime64('2005-03')])}) + lmdb_version_store.append("sym", df_non_empty) + expected_result = pd.DataFrame({"col1": np.array([np.datetime64('NaT'), np.datetime64('NaT'), np.datetime64('2005-02'), np.datetime64('2005-03'), np.datetime64('2005-03')], dtype="datetime64[ns]")}) + assert_frame_equal(lmdb_version_store.read("sym").data, expected_result) - @pytest.mark.xfail(reason="Not implemented") - def test_can_append_empty_to_non_empty(self, lmdb_version_store, element_type_constructor): - df_empty = pd.DataFrame({"col1": [None, None]}) - df_non_empty = pd.DataFrame({"col1": map(element_type_constructor, [1, 2, 3])}) - lmdb_version_store.write("test_can_append_to_empty_column", df_non_empty) - lmdb_version_store.append("test_can_append_to_empty_column", df_empty) - expected_result = pd.concat([df_non_empty, df_empty], ignore_index=True) - assert_frame_equal(lmdb_version_store.read("test_can_append_to_empty_column").data, expected_result) - - @pytest.mark.xfail(reason="Not implemented") - def test_can_append_empty_to_empty(self, lmdb_version_store, element_type_constructor): - df_empty_1 = pd.DataFrame({"col1": [None, None]}) - df_empty_2 = pd.DataFrame({"col1": []}) - df_empty_3 = pd.DataFrame({"col1": [None]}) - lmdb_version_store.write("test_can_append_to_empty_column", df_empty_1) - lmdb_version_store.append("test_can_append_to_empty_column", df_empty_2) - lmdb_version_store.append("test_can_append_to_empty_column", df_empty_3) - expected_result = pd.concat([df_empty_1, df_empty_2, df_empty_3], ignore_index=True) - assert_frame_equal(lmdb_version_store.read("test_can_append_to_empty_column").data, expected_result) From d350261e4ceaa515d3d687e56877d851b5e434cf Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Mon, 15 Jan 2024 10:36:17 +0200 Subject: [PATCH 03/30] Add a test for appending none to int --- .../version_store/test_empty_column_type.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index 20e7969b53..bd9b03e45a 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -71,3 +71,16 @@ def test_date(self, lmdb_version_store): expected_result = pd.DataFrame({"col1": np.array([np.datetime64('NaT'), np.datetime64('NaT'), np.datetime64('2005-02'), np.datetime64('2005-03'), np.datetime64('2005-03')], dtype="datetime64[ns]")}) assert_frame_equal(lmdb_version_store.read("sym").data, expected_result) + +class TestCanAppendEmptyToColumn: + + def test_integer(self, lmdb_version_store_v2): + int_dtype = "int32" + lmdb_version_store = 
lmdb_version_store_v2 + df = pd.DataFrame({"col1": np.array([1,2,3], dtype=int_dtype)}) + lmdb_version_store.write("sym", df) + lmdb_version_store.append("sym", pd.DataFrame({"col1": [None, None]})) + lmdb_version_store.append("sym", pd.DataFrame({"col1": [None, None]})) + expected_result = pd.DataFrame({"col1": np.array([1,2,3,0,0,0,0], dtype=int_dtype)}) + print(lmdb_version_store.read("sym", row_range=[5,7]).data) + assert_frame_equal(lmdb_version_store.read("sym").data, expected_result) \ No newline at end of file From 8eb7253e2f023d50ced6df51baf4f88351037ba3 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Mon, 22 Jan 2024 18:28:39 +0200 Subject: [PATCH 04/30] Another batch of fixes for the empty data type - Remove PYBOOL64 - Fix issues regarding appending empty to strings and strings to empty with dynamic schema. - Add tests for appending empty to all other types and all other types to empty --- cpp/arcticdb/CMakeLists.txt | 4 + cpp/arcticdb/column_store/column.cpp | 21 +- cpp/arcticdb/column_store/column.hpp | 2 +- cpp/arcticdb/column_store/column_utils.hpp | 8 +- cpp/arcticdb/entity/descriptors.hpp | 14 ++ cpp/arcticdb/entity/timeseries_descriptor.hpp | 1 - cpp/arcticdb/entity/type_utils.cpp | 160 +++++++++++++ cpp/arcticdb/entity/type_utils.hpp | 152 ++---------- cpp/arcticdb/entity/types-inl.hpp | 5 +- cpp/arcticdb/entity/types.cpp | 10 +- cpp/arcticdb/entity/types.hpp | 23 +- cpp/arcticdb/pipeline/column_mapping.cpp | 137 +++++++++++ cpp/arcticdb/pipeline/column_mapping.hpp | 152 +++--------- cpp/arcticdb/pipeline/frame_utils.hpp | 7 +- cpp/arcticdb/pipeline/read_frame.cpp | 193 ++++++++------- cpp/arcticdb/pipeline/write_frame.cpp | 15 +- cpp/arcticdb/processing/aggregation.cpp | 5 - cpp/arcticdb/python/python_handlers.cpp | 74 +++--- cpp/arcticdb/python/python_handlers.hpp | 43 ++-- cpp/arcticdb/python/python_module.cpp | 1 - cpp/arcticdb/stream/index.hpp | 2 +- cpp/arcticdb/util/sparse_utils.hpp | 17 +- cpp/arcticdb/util/type_handler.hpp | 17 +- python/tests/conftest.py | 33 ++- .../version_store/test_empty_column_type.py | 220 +++++++++++++++--- 25 files changed, 816 insertions(+), 500 deletions(-) create mode 100644 cpp/arcticdb/entity/descriptors.hpp create mode 100644 cpp/arcticdb/entity/type_utils.cpp create mode 100644 cpp/arcticdb/pipeline/column_mapping.cpp diff --git a/cpp/arcticdb/CMakeLists.txt b/cpp/arcticdb/CMakeLists.txt index 5b28097825..71e347ae62 100644 --- a/cpp/arcticdb/CMakeLists.txt +++ b/cpp/arcticdb/CMakeLists.txt @@ -195,6 +195,7 @@ set(arcticdb_srcs entity/key.hpp entity/metrics.hpp entity/data_error.hpp + entity/descriptors.hpp entity/native_tensor.hpp entity/performance_tracing.hpp entity/protobuf_mappings.hpp @@ -203,6 +204,7 @@ set(arcticdb_srcs entity/serialized_key.hpp entity/type_conversion.hpp entity/types.hpp + entity/type_utils.hpp entity/types-inl.hpp entity/variant_key.hpp entity/versioned_item.hpp @@ -370,7 +372,9 @@ set(arcticdb_srcs entity/metrics.cpp entity/performance_tracing.cpp entity/types.cpp + entity/type_utils.cpp log/log.cpp + pipeline/column_mapping.cpp pipeline/column_stats.cpp pipeline/frame_slice.cpp pipeline/frame_utils.cpp diff --git a/cpp/arcticdb/column_store/column.cpp b/cpp/arcticdb/column_store/column.cpp index 23ba7cf152..4d62a6b4d7 100644 --- a/cpp/arcticdb/column_store/column.cpp +++ b/cpp/arcticdb/column_store/column.cpp @@ -90,14 +90,14 @@ void Column::unsparsify(size_t num_rows) { if(!sparse_map_) return; - type_.visit_tag([that=this, num_rows] (const auto tdt) { + type_.visit_tag([this, num_rows] (const auto 
tdt) { using TagType = decltype(tdt); using RawType = typename TagType::DataTypeTag::raw_type; const auto dest_bytes = num_rows * sizeof(RawType); auto dest = ChunkedBuffer::presized(dest_bytes); util::default_initialize(dest.data(), dest_bytes); - util::expand_dense_buffer_using_bitmap(that->sparse_map_.value(), that->data_.buffer().data(), dest.data()); - std::swap(dest, that->data_.buffer()); + util::expand_dense_buffer_using_bitmap(sparse_map_.value(), data_.buffer().data(), dest.data()); + std::swap(dest, data_.buffer()); }); sparse_map_ = std::nullopt; last_logical_row_ = last_physical_row_ = static_cast(num_rows) - 1; @@ -106,14 +106,13 @@ void Column::unsparsify(size_t num_rows) { } void Column::sparsify() { - type().visit_tag([that=this](auto type_desc_tag) { - using TDT = decltype(type_desc_tag); - using RawType = typename TDT::DataTypeTag::raw_type; - if constexpr(is_floating_point_type(TDT::DataTypeTag::data_type)) { - auto raw_ptr =reinterpret_cast(that->ptr()); - auto buffer = util::scan_floating_point_to_sparse(raw_ptr, that->row_count(), that->sparse_map()); - std::swap(that->data().buffer(), buffer); - that->last_physical_row_ = that->sparse_map().count() - 1; + type().visit_tag([this](auto type_desc_tag) { + using RawType = typename decltype(type_desc_tag)::DataTypeTag::raw_type; + if constexpr (is_floating_point_type(type_desc_tag.data_type())) { + auto raw_ptr = reinterpret_cast(ptr()); + auto buffer = util::scan_floating_point_to_sparse(raw_ptr, row_count(), sparse_map()); + std::swap(data().buffer(), buffer); + last_physical_row_ = sparse_map().count() - 1; } }); } diff --git a/cpp/arcticdb/column_store/column.hpp b/cpp/arcticdb/column_store/column.hpp index de72357498..b8f69d1599 100644 --- a/cpp/arcticdb/column_store/column.hpp +++ b/cpp/arcticdb/column_store/column.hpp @@ -217,7 +217,7 @@ class Column { // it pointers to an array type (at numpy arrays at the moment). When we allocate a column for an array we // need to allocate space for one pointer per row. This also affects how we handle arrays to python as well. // Check cpp/arcticdb/column_store/column_utils.hpp::array_at and cpp/arcticdb/column_store/column.hpp::Column - data_(expected_rows * (type.dimension() == Dimension::Dim0 ? get_type_size(type.data_type()) : sizeof(void*)), presize), + data_(expected_rows * entity::sizeof_datatype(type), presize), type_(type), allow_sparse_(allow_sparse){ ARCTICDB_TRACE(log::inmem(), "Creating column with descriptor {}", type); diff --git a/cpp/arcticdb/column_store/column_utils.hpp b/cpp/arcticdb/column_store/column_utils.hpp index 71fc1192a9..351617a62b 100644 --- a/cpp/arcticdb/column_store/column_utils.hpp +++ b/cpp/arcticdb/column_store/column_utils.hpp @@ -53,9 +53,7 @@ inline py::array array_at(const SegmentInMemory& frame, std::size_t col_pos, py: // (numpy) array, thus the size of the element is not the size of the raw type, but the size of a pointer. // This also affects how we allocate columns. Check cpp/arcticdb/column_store/column.hpp::Column and // cpp/arcticdb/pipeline/column_mapping.hpp::sizeof_datatype - if constexpr(tag.dimension() > Dimension::Dim0) { - esize = sizeof(PyObject*); - } + esize = sizeof_datatype(TypeDescriptor{tag}); } else if constexpr(tag.dimension() == Dimension::Dim2) { util::raise_rte("Read resulted in two dimensional type. 
This is not supported."); } else { @@ -105,9 +103,7 @@ inline py::array array_at(const SegmentInMemory& frame, std::size_t col_pos, py: // (numpy) array, thus the size of the element is not the size of the raw type, but the size of a pointer. // This also affects how we allocate columns. Check cpp/arcticdb/column_store/column.hpp::Column and // cpp/arcticdb/pipeline/column_mapping.hpp::sizeof_datatype - if constexpr(tag.dimension() > Dimension::Dim0) { - esize = sizeof(PyObject*); - } + esize = sizeof_datatype(TypeDescriptor{tag}); } else if constexpr(tag.dimension() == Dimension::Dim2) { util::raise_rte("Read resulted in two dimensional type. This is not supported."); } else { diff --git a/cpp/arcticdb/entity/descriptors.hpp b/cpp/arcticdb/entity/descriptors.hpp new file mode 100644 index 0000000000..87ffaa041f --- /dev/null +++ b/cpp/arcticdb/entity/descriptors.hpp @@ -0,0 +1,14 @@ +/* Copyright 2024 Man Group Operations Limited + * + * Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt. + * + * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software + * will be governed by the Apache License, version 2.0. + */ + +#pragma once + +#include +namespace arcticdb::proto { + namespace descriptors = arcticc::pb2::descriptors_pb2; +} \ No newline at end of file diff --git a/cpp/arcticdb/entity/timeseries_descriptor.hpp b/cpp/arcticdb/entity/timeseries_descriptor.hpp index 806051f129..3802fa831d 100644 --- a/cpp/arcticdb/entity/timeseries_descriptor.hpp +++ b/cpp/arcticdb/entity/timeseries_descriptor.hpp @@ -7,7 +7,6 @@ #pragma once -#include #include #include diff --git a/cpp/arcticdb/entity/type_utils.cpp b/cpp/arcticdb/entity/type_utils.cpp new file mode 100644 index 0000000000..715939590d --- /dev/null +++ b/cpp/arcticdb/entity/type_utils.cpp @@ -0,0 +1,160 @@ +/* Copyright 2024 Man Group Operations Limited + * + * Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt. + * + * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software + * will be governed by the Apache License, version 2.0. + */ + +#include +#include + +namespace arcticdb { + bool trivially_compatible_types(const entity::TypeDescriptor& left, const entity::TypeDescriptor& right) { + if (left == right) + return true; + + // Multidimensional types are pointers + if (left.dimension() >= entity::Dimension::Dim1 && right.dimension() >= entity::Dimension::Dim1) + return true; + + // Multidimensional types are pointers the empty type is pointer as well + if (left.dimension() >= entity::Dimension::Dim1 && is_empty_type(right.data_type())) + return true; + + // Multidimensional types are pointers the empty type is pointer as well + if (right.dimension() >= entity::Dimension::Dim1 && is_empty_type(left.data_type())) + return true; + + if (is_sequence_type(left.data_type()) && is_sequence_type(right.data_type())) { + // TODO coercion of utf strings is not always safe, should allow safe conversion and reinstate the + // stronger requirement for trivial conversion below. 
+ // if(!is_utf_type(slice_value_type(left.data_type)) && + // !is_utf_type(slice_value_type(right.data_type))) + // return true; + + return is_utf_type(slice_value_type(left.data_type())) == is_utf_type(slice_value_type(right.data_type())); + } + + return false; + } + + std::optional has_valid_type_promotion( + const entity::TypeDescriptor& source, + const entity::TypeDescriptor& target + ) { + + if (source.dimension() != target.dimension()) { + // Empty of dimension 0 means lack of any given type and can be promoted to anything (even if the dimensions + // don't match), e.g. empty type can become int or array of ints. Empty type of higher dimension is used to + // specify an empty array or an empty matrix, thus it cannot become any other type unless the dimensionality + // matches + if (is_empty_type(source.data_type()) && source.dimension() == entity::Dimension::Dim0) + return target; + return std::nullopt; + } + + if (source == target) + return target; + + // Empty type is coercible to any type + if (is_empty_type(source.data_type())) { + return target; + } + + // Nothing is coercible to the empty type. + if (is_empty_type(target.data_type())) { + return std::nullopt; + } + + auto source_type = source.data_type(); + auto target_type = target.data_type(); + auto source_size = slice_bit_size(source_type); + auto target_size = slice_bit_size(target_type); + + if (is_time_type(source_type)) { + if (!is_time_type(target_type)) + return std::nullopt; + } else if (is_unsigned_type(source_type)) { + if (is_unsigned_type(target_type)) { + // UINT->UINT, target_size must be >= source_size + if (source_size > target_size) + return std::nullopt; + } else if (is_signed_type(target_type)) { + // UINT->INT, target_size must be > source_size + if (source_size >= target_size) + return std::nullopt; + } else if (is_floating_point_type(target_type)) { + // UINT->FLOAT, no restrictions on relative sizes + } else { + // Non-numeric target type + return std::nullopt; + } + } else if (is_signed_type(source_type)) { + if (is_unsigned_type(target_type)) { + // INT->UINT never promotable + return std::nullopt; + } else if (is_signed_type(target_type)) { + // INT->INT, target_size must be >= source_size + if (source_size > target_size) + return std::nullopt; + } else if (is_floating_point_type(target_type)) { + // INT->FLOAT, no restrictions on relative sizes + } else { + // Non-numeric target type + return std::nullopt; + } + } else if (is_floating_point_type(source_type)) { + if (is_unsigned_type(target_type) || is_signed_type(target_type)) { + // FLOAT->U/INT never promotable + return std::nullopt; + } else if (is_floating_point_type(target_type)) { + // FLOAT->FLOAT, target_size must be >= source_size + if (source_size > target_size) + return std::nullopt; + } else { + // Non-numeric target type + return std::nullopt; + } + } else if (is_sequence_type(source_type) && is_sequence_type(target_type)) { + // Only allow promotion with UTF strings, and only to dynamic (never to fixed width) + if (!is_utf_type(source_type) || !is_utf_type(target_type) || !is_dynamic_string_type(target_type)) + return std::nullopt; + } else if (is_py_bool_type(source_type)) { + return std::nullopt; + } else { + // Non-numeric source type + return std::nullopt; + } + + return entity::TypeDescriptor{ + combine_data_type(slice_value_type(target_type), target_size), + target.dimension()}; + } + + std::optional has_valid_type_promotion( + const proto::descriptors::TypeDescriptor& source, + const proto::descriptors::TypeDescriptor& target 
+ ) { + return has_valid_type_promotion(entity::type_desc_from_proto(source), entity::type_desc_from_proto(target)); + } + + std::optional has_valid_common_type( + const entity::TypeDescriptor& left, + const entity::TypeDescriptor& right + ) { + auto maybe_common_type = has_valid_type_promotion(left, right); + if (!maybe_common_type) { + maybe_common_type = has_valid_type_promotion(right, left); + } + return maybe_common_type; + } + + std::optional has_valid_common_type( + const proto::descriptors::TypeDescriptor& left, + const proto::descriptors::TypeDescriptor& right + ) { + return has_valid_common_type(entity::type_desc_from_proto(left), entity::type_desc_from_proto(right)); + } + +} \ No newline at end of file diff --git a/cpp/arcticdb/entity/type_utils.hpp b/cpp/arcticdb/entity/type_utils.hpp index 13804582f7..4ecc0f2606 100644 --- a/cpp/arcticdb/entity/type_utils.hpp +++ b/cpp/arcticdb/entity/type_utils.hpp @@ -6,146 +6,36 @@ */ #pragma once - -#include +#include +#include namespace arcticdb { -inline bool trivially_compatible_types(entity::TypeDescriptor left, entity::TypeDescriptor right) { - if(left == right) - return true; - - // Multidimensional types are pointers - if(left.dimension() >= entity::Dimension::Dim1 && right.dimension() >= entity::Dimension::Dim1) - return true; - - // Multidimensional types are pointer the empty type is pointer as well - if(left.dimension() >= entity::Dimension::Dim1 && is_empty_type(right.data_type())) - return true; - - // Multidimensional types are pointer the empty type is pointer as well - if(right.dimension() >= entity::Dimension::Dim1 && is_empty_type(left.data_type())) - return true; - - if(is_sequence_type(left.data_type()) && is_sequence_type(right.data_type())) { - //TODO coercion of utf strings is not always safe, should allow safe conversion and reinstate the - //stronger requirement for trivial conversion below. -// if(!is_utf_type(slice_value_type(left.data_type)) && !is_utf_type(slice_value_type(right.data_type))) -// return true; - - return is_utf_type(slice_value_type(left.data_type())) == is_utf_type(slice_value_type(right.data_type())); - } - - if(is_py_bool_type(right.data_type()) && is_py_bool_type(right.data_type())) { - return true; - } - - return false; +namespace entity { + struct TypeDescriptor; } -inline std::optional has_valid_type_promotion(entity::TypeDescriptor source, entity::TypeDescriptor target) { - - if(source.dimension() != target.dimension()) { - // Empty of dimension 0 means lack of any given type and can be promoted to anything (even if the dimensions don't - // match), e.g. empty type can become int or array of ints. Empty type of higher dimension is used to specify an - // empty array or an empty matrix, thus it cannot become any other type unless the dimensionality matches - if(is_empty_type(source.data_type()) && source.dimension() == entity::Dimension::Dim0) - return target; - return std::nullopt; - } - - if (source == target) - return target; +/// Two types are trivially compatible if their byte representation is exactly the same i.e. 
you can memcpy +/// n elements of left type from one buffer to n elements of type right in another buffer and get the same result +[[nodiscard]] bool trivially_compatible_types(const entity::TypeDescriptor& left, const entity::TypeDescriptor& right); - // Empty type is coercible to any type - if(is_empty_type(source.data_type())) { - return target; - } +[[nodiscard]] std::optional has_valid_type_promotion( + const entity::TypeDescriptor& source, + const entity::TypeDescriptor& target +); - // Nothing is coercible to the empty type. - if(is_empty_type(target.data_type())) { - return std::nullopt; - } - - auto source_type = source.data_type(); - auto target_type = target.data_type(); - auto source_size = slice_bit_size(source_type); - auto target_size = slice_bit_size(target_type); - - if (is_time_type(source_type)) { - if (!is_time_type(target_type)) - return std::nullopt; - } else if (is_unsigned_type(source_type)) { - if (is_unsigned_type(target_type)) { - // UINT->UINT, target_size must be >= source_size - if (source_size > target_size) - return std::nullopt; - } else if (is_signed_type(target_type)) { - // UINT->INT, target_size must be > source_size - if (source_size >= target_size) - return std::nullopt; - } else if (is_floating_point_type(target_type)) { - // UINT->FLOAT, no restrictions on relative sizes - } else { - // Non-numeric target type - return std::nullopt; - } - } else if (is_signed_type(source_type)) { - if (is_unsigned_type(target_type)) { - // INT->UINT never promotable - return std::nullopt; - } else if (is_signed_type(target_type)) { - // INT->INT, target_size must be >= source_size - if (source_size > target_size) - return std::nullopt; - } else if (is_floating_point_type(target_type)) { - // INT->FLOAT, no restrictions on relative sizes - } else { - // Non-numeric target type - return std::nullopt; - } - } else if (is_floating_point_type(source_type)) { - if (is_unsigned_type(target_type) || is_signed_type(target_type)) { - // FLOAT->U/INT never promotable - return std::nullopt; - } else if (is_floating_point_type(target_type)) { - // FLOAT->FLOAT, target_size must be >= source_size - if (source_size > target_size) - return std::nullopt; - } else { - // Non-numeric target type - return std::nullopt; - } - } else if (is_sequence_type(source_type) && is_sequence_type(target_type)) { - // Only allow promotion with UTF strings, and only to dynamic (never to fixed width) - if (!is_utf_type(source_type) || !is_utf_type(target_type) || !is_dynamic_string_type(target_type)) - return std::nullopt; - } else { - // Non-numeric source type - return std::nullopt; - } - - return entity::TypeDescriptor{combine_data_type(slice_value_type(target_type), target_size), target.dimension()}; -} - -inline std::optional has_valid_type_promotion( +[[nodiscard]] std::optional has_valid_type_promotion( const proto::descriptors::TypeDescriptor& source, - const proto::descriptors::TypeDescriptor& target) { - return has_valid_type_promotion(entity::type_desc_from_proto(source), entity::type_desc_from_proto(target)); -} + const proto::descriptors::TypeDescriptor& target +); -inline std::optional has_valid_common_type(entity::TypeDescriptor left, entity::TypeDescriptor right) { - auto maybe_common_type = has_valid_type_promotion(left, right); - if (!maybe_common_type) { - maybe_common_type = has_valid_type_promotion(right, left); - } - return maybe_common_type; -} +[[nodiscard]] std::optional has_valid_common_type( + const entity::TypeDescriptor& left, + const entity::TypeDescriptor& right +); 
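
A usage sketch for these declarations (hypothetical caller code, not part of the patch):

    const entity::TypeDescriptor src{entity::DataType::INT32, entity::Dimension::Dim0};
    const entity::TypeDescriptor tgt{entity::DataType::INT64, entity::Dimension::Dim0};
    // INT32 widens losslessly to INT64, so a promoted descriptor is returned
    std::optional<entity::TypeDescriptor> promoted = has_valid_type_promotion(src, tgt);
    // the "common type" query tries the promotion in both directions
    std::optional<entity::TypeDescriptor> common = has_valid_common_type(tgt, src);
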
-inline std::optional has_valid_common_type( +[[nodiscard]] std::optional has_valid_common_type( const proto::descriptors::TypeDescriptor& left, - const proto::descriptors::TypeDescriptor& right) { - return has_valid_common_type(entity::type_desc_from_proto(left), entity::type_desc_from_proto(right)); -} - + const proto::descriptors::TypeDescriptor& right +); } diff --git a/cpp/arcticdb/entity/types-inl.hpp b/cpp/arcticdb/entity/types-inl.hpp index d9d80c87e0..cb16fafbdf 100644 --- a/cpp/arcticdb/entity/types-inl.hpp +++ b/cpp/arcticdb/entity/types-inl.hpp @@ -18,7 +18,7 @@ namespace arcticdb::entity { namespace details { template -auto visit_dim(DataType dt, Callable &&c) { +constexpr auto visit_dim(DataType dt, Callable &&c) { switch (dt) { #define DT_CASE(__T__) case DataType::__T__: \ return c(TypeDescriptorTag, DimType>()); @@ -40,7 +40,6 @@ auto visit_dim(DataType dt, Callable &&c) { DT_CASE(UTF_DYNAMIC64) DT_CASE(EMPTYVAL) DT_CASE(PYBOOL8) - DT_CASE(PYBOOL64) #undef DT_CASE default: util::raise_rte("Invalid dtype '{}' in visit dim", datatype_to_str(dt)); } @@ -77,7 +76,7 @@ auto visit_type(DataType dt, Callable &&c) { } // namespace details template -auto TypeDescriptor::visit_tag(Callable &&callable) const { +constexpr auto TypeDescriptor::visit_tag(Callable &&callable) const { switch (dimension_) { case Dimension::Dim0: return details::visit_dim>(data_type_, callable); case Dimension::Dim1: return details::visit_dim>(data_type_, callable); diff --git a/cpp/arcticdb/entity/types.cpp b/cpp/arcticdb/entity/types.cpp index 5da45658bd..4f7593b23a 100644 --- a/cpp/arcticdb/entity/types.cpp +++ b/cpp/arcticdb/entity/types.cpp @@ -6,6 +6,7 @@ */ #include +#include namespace arcticdb::entity { @@ -37,10 +38,17 @@ std::string_view datatype_to_str(const DataType dt) { TO_STR(UTF_DYNAMIC64) TO_STR(EMPTYVAL) TO_STR(PYBOOL8) - TO_STR(PYBOOL64) #undef TO_STR default:return std::string_view("UNKNOWN"); } } +std::size_t sizeof_datatype(const TypeDescriptor& td) { + return td.visit_tag([](auto tdt) { + using RawType = typename std::decay_t::DataTypeTag::raw_type; + auto handler = TypeHandlerRegistry::instance()->get_handler(TypeDescriptor{tdt}); + return handler ? handler->type_size() : sizeof(RawType); + }); +} + } // namespace arcticdb diff --git a/cpp/arcticdb/entity/types.hpp b/cpp/arcticdb/entity/types.hpp index 771a03d083..2a16a30df1 100644 --- a/cpp/arcticdb/entity/types.hpp +++ b/cpp/arcticdb/entity/types.hpp @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -33,13 +34,6 @@ using ssize_t = SSIZE_T; #endif -// TODO: Forward declare further and move the inclusion of `descriptors.pb.h` in `types.cpp`. 
-#include - -namespace arcticdb::proto { - namespace descriptors = arcticc::pb2::descriptors_pb2; -} - namespace arcticdb::entity { enum class SortedValue : uint8_t { @@ -232,7 +226,6 @@ enum class DataType : uint8_t { UTF_DYNAMIC64 = detail::combine_val_bits(ValueType::UTF_DYNAMIC, SizeBits::S64), EMPTYVAL = detail::combine_val_bits(ValueType::EMPTY, SizeBits::S64), PYBOOL8 = detail::combine_val_bits(ValueType::PYBOOL, SizeBits::S8), - PYBOOL64 = detail::combine_val_bits(ValueType::PYBOOL, SizeBits::S64), UNKNOWN = 0, }; @@ -418,7 +411,6 @@ DATA_TYPE_TAG(UTF_FIXED64, std::uint64_t) DATA_TYPE_TAG(UTF_DYNAMIC64, std::uint64_t) DATA_TYPE_TAG(EMPTYVAL, std::uint64_t) DATA_TYPE_TAG(PYBOOL8, uint8_t) -DATA_TYPE_TAG(PYBOOL64, std::uint64_t) #undef DATA_TYPE_TAG enum class Dimension : uint8_t { @@ -474,7 +466,7 @@ struct TypeDescriptor { ARCTICDB_MOVE_COPY_DEFAULT(TypeDescriptor) template - auto visit_tag(Callable &&callable) const; + constexpr auto visit_tag(Callable &&callable) const; bool operator==(const TypeDescriptor &o) const { return data_type_ == o.data_type_ && dimension_ == o.dimension_; @@ -552,16 +544,20 @@ struct TypeDescriptorTag { using DataTypeTag = DT; using DimensionTag = D; explicit constexpr operator TypeDescriptor() const { - return TypeDescriptor{DataTypeTag::data_type, DimensionTag::value}; + return type_descriptor(); } - [[nodiscard]] constexpr Dimension dimension() const { + [[nodiscard]] static constexpr Dimension dimension() { return DimensionTag::value; } - [[nodiscard]] constexpr DataType data_type() const { + [[nodiscard]] static constexpr DataType data_type() { return DataTypeTag::data_type; } + + [[nodiscard]] static constexpr TypeDescriptor type_descriptor() { + return TypeDescriptor{DataTypeTag::data_type, DimensionTag::value}; + } }; inline DataType get_data_type(const arcticdb::proto::descriptors::TypeDescriptor& type_desc) { @@ -804,6 +800,7 @@ inline bool operator!=(const Field& l, const Field& r) { return !(l == r); } +std::size_t sizeof_datatype(const TypeDescriptor& td); } // namespace arcticdb::entity // StreamId ordering - numbers before strings diff --git a/cpp/arcticdb/pipeline/column_mapping.cpp b/cpp/arcticdb/pipeline/column_mapping.cpp new file mode 100644 index 0000000000..f348cb9f49 --- /dev/null +++ b/cpp/arcticdb/pipeline/column_mapping.cpp @@ -0,0 +1,137 @@ +/* Copyright 2024 Man Group Operations Limited + * + * Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt. + * + * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software + * will be governed by the Apache License, version 2.0. 
+ */ + +#include +#include +#include + +namespace arcticdb { + ColumnMapping::ColumnMapping( + SegmentInMemory& frame, + size_t dst_col, + size_t field_col, + pipelines::PipelineContextRow& context + ) : + source_type_desc_(context.descriptor().fields(field_col).type()), + dest_type_desc_(frame.field(dst_col).type()), + frame_field_descriptor_(frame.field(dst_col)), + dest_size_(sizeof_datatype(dest_type_desc_)), + num_rows_(context.slice_and_key().slice_.row_range.diff()), + first_row_(context.slice_and_key().slice_.row_range.first - frame.offset()), + offset_bytes_(dest_size_ * first_row_), + dest_bytes_(dest_size_ * num_rows_) { + } + + StaticColumnMappingIterator::StaticColumnMappingIterator( + pipelines::PipelineContextRow& context, + size_t index_fieldcount + ) : + index_fieldcount_(index_fieldcount), + field_count_(context.slice_and_key().slice_.col_range.diff() + index_fieldcount), + first_slice_col_offset_(context.slice_and_key().slice_.col_range.first), + last_slice_col_offset_(context.slice_and_key().slice_.col_range.second), + bit_set_(context.get_selected_columns()) { + prev_col_offset_ = first_slice_col_offset_ - 1; + if (bit_set_) { + source_col_ = (*bit_set_)[bv_size(first_slice_col_offset_)] + ? first_slice_col_offset_ + : bit_set_->get_next(bv_size(first_slice_col_offset_)); + if ((*bit_set_)[bv_size(first_slice_col_offset_)]) { + source_col_ = first_slice_col_offset_; + } else { + auto next_pos = bit_set_->get_next(bv_size(first_slice_col_offset_)); + // We have to do this extra check in bitmagic, get_next returns 0 in case no next present + if (next_pos == 0 && bit_set_->size() > 0 && !bit_set_->test(0)) + invalid_ = true; + else + source_col_ = next_pos; + } + if (source_col_ < first_slice_col_offset_) + invalid_ = true; + + } else { + source_col_ = first_slice_col_offset_; + } + + dst_col_ = bit_set_ ? 
bit_set_->count_range(0, bv_size(source_col_)) - 1 : source_col_; + source_field_pos_ = (source_col_ - first_slice_col_offset_) + index_fieldcount_; + } + + std::optional StaticColumnMappingIterator::get_next_source_col() const { + if (!bit_set_) { + return source_col_ + 1; + } else { + auto next_pos = bit_set_->get_next(bv_size(source_col_)); + if (next_pos == 0) + return std::nullopt; + else + return next_pos; + } + } + + void StaticColumnMappingIterator::advance() { + ++dst_col_; + prev_col_offset_ = source_col_; + auto new_source_col = get_next_source_col(); + if (new_source_col) { + source_col_ = *new_source_col; + source_field_pos_ = (source_col_ - first_slice_col_offset_) + index_fieldcount_; + } else { + source_field_pos_ = field_count_; + source_col_ = last_slice_col_offset_; + } + } + + bool StaticColumnMappingIterator::invalid() const { + return invalid_; + } + + bool StaticColumnMappingIterator::has_next() const { + return source_field_pos_ < field_count_; + } + + bool StaticColumnMappingIterator::at_end_of_selected() const { + return !source_col_ || source_col_ >= last_slice_col_offset_; + } + + size_t StaticColumnMappingIterator::remaining_fields() const { + return field_count_ - source_field_pos_; + } + + size_t StaticColumnMappingIterator::prev_col_offset() const { + return prev_col_offset_; + } + + size_t StaticColumnMappingIterator::source_field_pos() const { + return source_field_pos_; + } + + size_t StaticColumnMappingIterator::source_col() const { + return source_col_; + } + + size_t StaticColumnMappingIterator::first_slice_col_offset() const { + return first_slice_col_offset_; + } + + size_t StaticColumnMappingIterator::last_slice_col_offset() const { + return last_slice_col_offset_; + } + + size_t StaticColumnMappingIterator::dest_col() const { + return dst_col_; + } + + size_t StaticColumnMappingIterator::field_count() const { + return field_count_; + } + + size_t StaticColumnMappingIterator::index_fieldcount() const { + return index_fieldcount_; + } +} // namespace arcticdb \ No newline at end of file diff --git a/cpp/arcticdb/pipeline/column_mapping.hpp b/cpp/arcticdb/pipeline/column_mapping.hpp index 3279ea7493..1f923623c2 100644 --- a/cpp/arcticdb/pipeline/column_mapping.hpp +++ b/cpp/arcticdb/pipeline/column_mapping.hpp @@ -5,41 +5,29 @@ * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. */ +#pragma once #include +#include + +namespace arcticdb::pipelines { + struct PipelineContextRow; +} namespace arcticdb { -inline std::size_t sizeof_datatype(const TypeDescriptor &td) { - return td.visit_tag([](auto dt) { - using RawType = typename std::decay_t::DataTypeTag::raw_type; - // Array types are stored on disk as flat sequences. The python layer cannot work with this. We need to pass - // it pointers to an array type (at numpy arrays at the moment). When we allocate a column for an array we - // need to allocate space for one pointer per row. This also affects how we handle arrays to python as well. - // Check cpp/arcticdb/column_store/column_utils.hpp::array_at and cpp/arcticdb/column_store/column.hpp - return dt.dimension() == Dimension::Dim0 ? 
sizeof(RawType) : sizeof(void*); - }); -} +class SegmentInMemory; struct ColumnMapping { - const TypeDescriptor source_type_desc_; - const TypeDescriptor dest_type_desc_; - const Field& frame_field_descriptor_; + const entity::TypeDescriptor source_type_desc_; + const entity::TypeDescriptor dest_type_desc_; + const entity::Field& frame_field_descriptor_; const size_t dest_size_; const size_t num_rows_; const size_t first_row_; const size_t offset_bytes_; const size_t dest_bytes_; - ColumnMapping(SegmentInMemory &frame, size_t dst_col, size_t field_col, pipelines::PipelineContextRow &context) : - source_type_desc_(context.descriptor().fields(field_col).type()), - dest_type_desc_(frame.field(dst_col).type()), - frame_field_descriptor_(frame.field(dst_col)), - dest_size_(sizeof_datatype(dest_type_desc_)), - num_rows_(context.slice_and_key().slice_.row_range.diff()), - first_row_(context.slice_and_key().slice_.row_range.first - frame.offset()), - offset_bytes_(dest_size_ * first_row_), - dest_bytes_(dest_size_ * num_rows_) { - } + ColumnMapping(SegmentInMemory& frame, size_t dst_col, size_t field_col, pipelines::PipelineContextRow& context); }; struct StaticColumnMappingIterator { @@ -53,109 +41,23 @@ struct StaticColumnMappingIterator { size_t dst_col_ = 0; bool invalid_ = false; const std::optional& bit_set_; - - StaticColumnMappingIterator(pipelines::PipelineContextRow& context, size_t index_fieldcount) : - index_fieldcount_(index_fieldcount), - field_count_(context.slice_and_key().slice_.col_range.diff() + index_fieldcount), - first_slice_col_offset_(context.slice_and_key().slice_.col_range.first), - last_slice_col_offset_(context.slice_and_key().slice_.col_range.second), - bit_set_(context.get_selected_columns()) { - prev_col_offset_ = first_slice_col_offset_ - 1; - if(bit_set_) { - source_col_ = (*bit_set_)[bv_size(first_slice_col_offset_)] ? first_slice_col_offset_ : bit_set_->get_next(bv_size(first_slice_col_offset_)); - if ((*bit_set_)[bv_size(first_slice_col_offset_)]) { - source_col_ = first_slice_col_offset_; - } else { - auto next_pos = bit_set_->get_next(bv_size(first_slice_col_offset_)); - // We have to do this extra check in bitmagic, get_next returns 0 in case no next present - if (next_pos == 0 && bit_set_->size() > 0 && !bit_set_->test(0)) - invalid_ = true; - else - source_col_ = next_pos; - } - if(source_col_ < first_slice_col_offset_) - invalid_ = true; - - } else { - source_col_ = first_slice_col_offset_; - } - - dst_col_ = bit_set_ ? 
bit_set_->count_range(0, bv_size(source_col_)) - 1 : source_col_; - source_field_pos_ = (source_col_ - first_slice_col_offset_) + index_fieldcount_; - } - - std::optional get_next_source_col() const { - if(!bit_set_) { - return source_col_ + 1; - } else { - auto next_pos = bit_set_->get_next(bv_size(source_col_)); - if(next_pos == 0) - return std::nullopt; - else - return next_pos; - } - } - - void advance() { - ++dst_col_; - prev_col_offset_ = source_col_; - auto new_source_col = get_next_source_col(); - if(new_source_col) { - source_col_ = *new_source_col; - source_field_pos_ = (source_col_ - first_slice_col_offset_) + index_fieldcount_; - } else { - source_field_pos_ = field_count_; - source_col_ = last_slice_col_offset_; - } - } - - bool invalid() const { - return invalid_; - } - - bool has_next() const { - return source_field_pos_ < field_count_; - } - - bool at_end_of_selected() const { - return !source_col_ || source_col_ >= last_slice_col_offset_; - } - - size_t remaining_fields() const { - return field_count_ - source_field_pos_; - } - - size_t prev_col_offset() const { - return prev_col_offset_; - } - - size_t source_field_pos() const { - return source_field_pos_; - } - - size_t source_col() const { - return source_col_; - } - - size_t first_slice_col_offset() const { - return first_slice_col_offset_; - } - - size_t last_slice_col_offset() const { - return last_slice_col_offset_; - } - - size_t dest_col() const { - return dst_col_; - } - - size_t field_count() const { - return field_count_; - } - size_t index_fieldcount() const { - return index_fieldcount_; - } + StaticColumnMappingIterator(pipelines::PipelineContextRow& context, size_t index_fieldcount); + + void advance(); + [[nodiscard]] std::optional get_next_source_col() const; + [[nodiscard]] bool invalid() const; + [[nodiscard]] bool has_next() const; + [[nodiscard]] bool at_end_of_selected() const; + [[nodiscard]] size_t remaining_fields() const; + [[nodiscard]] size_t prev_col_offset() const; + [[nodiscard]] size_t source_field_pos() const; + [[nodiscard]] size_t source_col() const; + [[nodiscard]] size_t first_slice_col_offset() const; + [[nodiscard]] size_t last_slice_col_offset() const; + [[nodiscard]] size_t dest_col() const; + [[nodiscard]] size_t field_count() const; + [[nodiscard]] size_t index_fieldcount() const; }; diff --git a/cpp/arcticdb/pipeline/frame_utils.hpp b/cpp/arcticdb/pipeline/frame_utils.hpp index 11074a77b6..c764063509 100644 --- a/cpp/arcticdb/pipeline/frame_utils.hpp +++ b/cpp/arcticdb/pipeline/frame_utils.hpp @@ -209,12 +209,9 @@ std::optional aggregator_set_data( const auto num_values = bitset.count(); auto bool_buffer = ChunkedBuffer::presized(num_values * sizeof(uint8_t)); - auto en = bitset.first(); - auto en_end = bitset.end(); auto bool_ptr = bool_buffer.ptr_cast(0u, num_values); - while (en < en_end) { - *bool_ptr = static_cast(PyObject_IsTrue(ptr_data[*en])); - ++en; + for (auto it = bitset.first(); it < bitset.end(); ++it) { + *bool_ptr = static_cast(PyObject_IsTrue(ptr_data[*it])); ++bool_ptr; } if(bitset.count() > 0) diff --git a/cpp/arcticdb/pipeline/read_frame.cpp b/cpp/arcticdb/pipeline/read_frame.cpp index e8f2e90061..d62a037d17 100644 --- a/cpp/arcticdb/pipeline/read_frame.cpp +++ b/cpp/arcticdb/pipeline/read_frame.cpp @@ -56,27 +56,14 @@ void mark_index_slices( column_groups).value(); } -StreamDescriptor modify_output_sizes(StreamDescriptor&& desc) { - for(auto& field : desc.mutable_fields()) { - if(is_py_bool_type(field.type().data_type())) { - 
field.mutable_type_desc()->set_size_bits(SizeBits::S64); - } - } - return std::move(desc); -} - StreamDescriptor get_filtered_descriptor(StreamDescriptor&& descriptor, const std::shared_ptr& filter_columns) { // We assume here that filter_columns_ will always contain the index. auto desc = std::move(descriptor); auto index = stream::index_type_from_descriptor(desc); return util::variant_match(index, [&desc, &filter_columns] (const auto& idx) { - if(filter_columns) { - return modify_output_sizes(StreamDescriptor{index_descriptor(desc.id(), idx, *filter_columns)}); - } - else { - return modify_output_sizes(StreamDescriptor{index_descriptor(desc.id(), idx, *desc.fields_ptr())}); - } + const std::shared_ptr& fields = filter_columns ? filter_columns : desc.fields_ptr(); + return StreamDescriptor{index_descriptor(desc.id(), idx, *fields)}; }); } @@ -225,21 +212,20 @@ void decode_or_expand_impl( const uint8_t*& data, uint8_t* dest, const EncodedFieldType& encoded_field_info, - const TypeDescriptor& source_type_descriptor, - const TypeDescriptor& destination_type_descriptor, size_t dest_bytes, std::shared_ptr buffers, - EncodingVersion encding_version) { - if(auto handler = TypeHandlerRegistry::instance()->get_handler(source_type_descriptor); handler) { + EncodingVersion encding_version, + const ColumnMapping& m +) { + if (auto handler = TypeHandlerRegistry::instance()->get_handler(m.source_type_desc_); handler) { handler->handle_type( data, dest, VariantField{&encoded_field_info}, - source_type_descriptor, - destination_type_descriptor, dest_bytes, std::move(buffers), - encding_version + encding_version, + m ); } else { std::optional bv; @@ -248,8 +234,8 @@ void decode_or_expand_impl( const auto bytes = encoding_sizes::data_uncompressed_size(ndarray); ChunkedBuffer sparse{bytes}; SliceDataSink sparse_sink{sparse.data(), bytes}; - data += decode_field(source_type_descriptor, encoded_field_info, data, sparse_sink, bv, encding_version); - source_type_descriptor.visit_tag([dest, dest_bytes, &bv, &sparse](const auto tdt) { + data += decode_field(m.source_type_desc_, encoded_field_info, data, sparse_sink, bv, encding_version); + m.source_type_desc_.visit_tag([dest, dest_bytes, &bv, &sparse](const auto tdt) { using TagType = decltype(tdt); using RawType = typename TagType::DataTypeTag::raw_type; util::default_initialize(dest, dest_bytes); @@ -259,12 +245,12 @@ void decode_or_expand_impl( SliceDataSink sink(dest, dest_bytes); const auto &ndarray = encoded_field_info.ndarray(); if (const auto bytes = encoding_sizes::data_uncompressed_size(ndarray); bytes < dest_bytes) { - source_type_descriptor.visit_tag([dest, bytes, dest_bytes](const auto tdt) { + m.source_type_desc_.visit_tag([dest, bytes, dest_bytes](const auto tdt) { using TagType = decltype(tdt); util::default_initialize(dest + bytes, dest_bytes - bytes); }); } - data += decode_field(source_type_descriptor, encoded_field_info, data, sink, bv, encding_version); + data += decode_field(m.source_type_desc_, encoded_field_info, data, sink, bv, encding_version); } } } @@ -289,22 +275,20 @@ void decode_or_expand( const uint8_t*& data, uint8_t* dest, const VariantField& variant_field, - const TypeDescriptor& source_type_descriptor, - const TypeDescriptor& destination_type_descriptor, size_t dest_bytes, std::shared_ptr buffers, - EncodingVersion encoding_version + EncodingVersion encoding_version, + const ColumnMapping& m ) { util::variant_match(variant_field, [&](auto field) { decode_or_expand_impl( data, dest, *field, - source_type_descriptor, - 
destination_type_descriptor, dest_bytes, buffers, - encoding_version + encoding_version, + m ); }); } @@ -391,23 +375,29 @@ void decode_into_frame_static( ColumnMapping m{frame, it.dest_col(), it.source_field_pos(), context}; const bool types_trivially_compatible = trivially_compatible_types(m.source_type_desc_, m.dest_type_desc_); const bool any_type_is_empty = is_empty_type(m.source_type_desc_.data_type()) || is_empty_type(m.dest_type_desc_.data_type()); - util::check(types_trivially_compatible || any_type_is_empty, "Column type conversion from {} to {} not implemented in column {}:{} -> {}:{}", - m.source_type_desc_, - m.dest_type_desc_, - it.source_col(), - field_name, - it.dest_col(), - m.frame_field_descriptor_.name()); - util::check(data != end || remaining_fields_empty(it, context), "Reached end of input block with {} fields to decode", it.remaining_fields()); + util::check( + types_trivially_compatible || any_type_is_empty, + "Column type conversion from {} to {} not implemented in column {}:{} -> {}:{}", + m.source_type_desc_, + m.dest_type_desc_, + it.source_col(), + field_name, + it.dest_col(), + m.frame_field_descriptor_.name() + ); + util::check( + data != end || remaining_fields_empty(it, context), + "Reached end of input block with {} fields to decode", + it.remaining_fields() + ); decode_or_expand( data, buffer.data() + m.offset_bytes_, encoded_field, - m.source_type_desc_, - m.dest_type_desc_, m.dest_bytes_, buffers, - encoding_version + encoding_version, + m ); ARCTICDB_TRACE(log::codec(), "Decoded column {} to position {}", field_name, data - begin); @@ -434,10 +424,11 @@ void decode_into_frame_static( } void decode_into_frame_dynamic( - SegmentInMemory &frame, - PipelineContextRow &context, - Segment &&s, - const std::shared_ptr& buffers) { + SegmentInMemory& frame, + PipelineContextRow& context, + Segment&& s, + const std::shared_ptr& buffers +) { ARCTICDB_SAMPLE_DEFAULT(DecodeIntoFrame) auto seg = std::move(s); const uint8_t *data = seg.buffer().data(); @@ -450,9 +441,11 @@ void decode_into_frame_dynamic( context.set_compacted(hdr.compacted()); const EncodingVersion encdoing_version = EncodingVersion(hdr.encoding_version()); const bool has_magic_numbers = encdoing_version == EncodingVersion::V2; + VariantEncodedFieldCollection fields(seg); - if (data != end) { - VariantEncodedFieldCollection fields(seg); + // data == end in case we have empty data types (e.g. {EMPTYVAL, Dim0}, {EMPTYVAL, Dim1}) for which we store nothing + // in storage as they can be reconstructed in the type handler on the read path. 
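+    // Illustrative aside (not part of the original change): writing a frame such as
+    //     pd.DataFrame({"col1": [None, None]})
+    // produces a column typed {EMPTYVAL, Dim0} whose data section is empty, so this
+    // point can be reached with data == end while fields.size() > 0; the values are
+    // then backfilled by the empty-type handler instead of being decoded from disk.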
+ if (data != end || fields.size() > 0) { auto index_field = fields.at(0u); decode_index_field(frame, index_field, data, begin, end, context, encdoing_version); @@ -469,53 +462,85 @@ void decode_into_frame_dynamic( auto dst_col = *frame_loc_opt; auto& buffer = frame.column(static_cast(dst_col)).data().buffer(); - if(ColumnMapping m{frame, dst_col, field_col, context};!trivially_compatible_types(m.source_type_desc_, m.dest_type_desc_)) { - util::check(static_cast(has_valid_type_promotion(m.source_type_desc_, m.dest_type_desc_)), "Can't promote type {} to type {} in field {}", - m.source_type_desc_, m.dest_type_desc_, m.frame_field_descriptor_.name()); - - m.dest_type_desc_.visit_tag([&buffer, &m, &data, encoded_field, buffers, encdoing_version] (auto dest_desc_tag) { - using DestinationType = typename decltype(dest_desc_tag)::DataTypeTag::raw_type; - m.source_type_desc_.visit_tag([&buffer, &m, &data, &encoded_field, &buffers, encdoing_version] (auto src_desc_tag ) { - using SourceType = typename decltype(src_desc_tag)::DataTypeTag::raw_type; - if constexpr (std::is_arithmetic_v && std::is_arithmetic_v) { - const auto src_bytes = sizeof_datatype(m.source_type_desc_) * m.num_rows_; + const ColumnMapping m{frame, dst_col, field_col, context}; + if(!trivially_compatible_types(m.source_type_desc_, m.dest_type_desc_)) { + util::check( + static_cast(has_valid_type_promotion(m.source_type_desc_, m.dest_type_desc_)), + "Can't promote type {} to type {} in field {}", + m.source_type_desc_, + m.dest_type_desc_, + m.frame_field_descriptor_.name() + ); + m.dest_type_desc_.visit_tag([&](auto dest_desc_tag) { + using DestinationType = typename decltype(dest_desc_tag)::DataTypeTag::raw_type; + m.source_type_desc_.visit_tag([&](auto src_desc_tag) { + using SourceType = typename decltype(src_desc_tag)::DataTypeTag::raw_type; + if constexpr (std::is_arithmetic_v && std::is_arithmetic_v) { + const auto src_bytes = sizeof_datatype(m.source_type_desc_) * m.num_rows_; + auto dest_ptr = reinterpret_cast(buffer.data() + m.offset_bytes_); + if constexpr (is_empty_type(src_desc_tag.data_type())) { + decode_or_expand( + data, + reinterpret_cast(dest_ptr), + encoded_field, + src_bytes, + buffers, + encdoing_version, + m + ); + } else { Buffer tmp_buf{src_bytes}; decode_or_expand( data, tmp_buf.data(), encoded_field, - m.source_type_desc_, - m.dest_type_desc_, src_bytes, buffers, - encdoing_version + encdoing_version, + m ); - auto src_ptr = reinterpret_cast(tmp_buf.data()); - auto dest_ptr = reinterpret_cast(buffer.data() + m.offset_bytes_); + auto src_ptr = reinterpret_cast(tmp_buf.data()); for (auto i = 0u; i < m.num_rows_; ++i) { *dest_ptr++ = static_cast(*src_ptr++); } - } else { - util::raise_rte("Can't promote type {} to type {} in field {}", m.source_type_desc_, m.dest_type_desc_, m.frame_field_descriptor_.name()); } - }); + } else { + util::raise_rte( + "Can't promote type {} to type {} in field {}", + m.source_type_desc_, + m.dest_type_desc_, + m.frame_field_descriptor_.name() + ); + } }); - ARCTICDB_TRACE(log::codec(), "Decoded column {} to position {}", m.frame_field_descriptor_.name(), data - begin); + }); + ARCTICDB_TRACE( + log::codec(), + "Decoded column {} to position {}", + m.frame_field_descriptor_.name(), + data - begin + ); } else { - ARCTICDB_TRACE(log::storage(), "Creating data slice at {} with total size {} ({} rows)", m.offset_bytes_, m.dest_bytes_, - context.slice_and_key().slice_.row_range.diff()); - util::check(data != end, - "Reached end of input block with {} fields to decode", - 
field_count - field_col);
+                        ARCTICDB_TRACE(
+                            log::storage(),
+                            "Creating data slice at {} with total size {} ({} rows)",
+                            m.offset_bytes_,
+                            m.dest_bytes_,
+                            context.slice_and_key().slice_.row_range.diff()
+                        );
+                        util::check(
+                            data != end || is_empty_type(m.source_type_desc_.data_type()),
+                            "Reached end of input block with {} fields to decode",
+                            field_count - field_col
+                        );
                         decode_or_expand(
                             data,
                             buffer.data() + m.offset_bytes_,
                             encoded_field,
-                            m.source_type_desc_,
-                            m.dest_type_desc_,
                             m.dest_bytes_,
                             buffers,
-                            encdoing_version
+                            encdoing_version,
+                            m
                         );
                     }
                     ARCTICDB_TRACE(log::codec(), "Decoded column {} to position {}", frame.field(dst_col).name(), data - begin);
@@ -1004,18 +1029,24 @@ class DynamicStringReducer : public StringReducer {
         const auto& segment_descriptor = context_row.descriptor();
         const auto& segment_field = segment_descriptor[column_index];

-        auto has_type_conversion = frame_field_.type() != segment_field.type();
-        util::check(!has_type_conversion || trivially_compatible_types(frame_field_.type(), segment_field.type()),
-                    "Cannot convert from type {} to {} in frame field", frame_field_.type(), segment_field.type());
+        const bool trivially_compatible = trivially_compatible_types(frame_field_.type(), segment_field.type());
+        // In case the segment type is EMPTYVAL the empty type handler should have run and set all missing values
+        // to not_a_string()
+        util::check(trivially_compatible || is_empty_type(segment_field.type().data_type()),
+            "String types are not trivially compatible. Cannot convert from type {} to {} in frame field.",
+            frame_field_.type(),
+            segment_field.type()
+        );

         auto is_utf = is_utf_type(slice_value_type(frame_field_.type().data_type()));
         size_t end = context_row.slice_and_key().slice_.row_range.second - frame_.offset();
+        const auto is_type_different = frame_field_.type() != segment_field.type();

         auto ptr_src = get_offset_ptr_at(row_, src_buffer_);
         if(do_lock_)
-            process_string_views<true>(has_type_conversion, is_utf, end, ptr_src, context_row.string_pool());
+            process_string_views<true>(is_type_different, is_utf, end, ptr_src, context_row.string_pool());
         else
-            process_string_views<false>(has_type_conversion, is_utf, end, ptr_src, context_row.string_pool());
+            process_string_views<false>(is_type_different, is_utf, end, ptr_src, context_row.string_pool());
     }

     void finalize() override {
diff --git a/cpp/arcticdb/pipeline/write_frame.cpp b/cpp/arcticdb/pipeline/write_frame.cpp
index 263e8aef0b..a3dfe27197 100644
--- a/cpp/arcticdb/pipeline/write_frame.cpp
+++ b/cpp/arcticdb/pipeline/write_frame.cpp
@@ -193,13 +193,24 @@ folly::Future<entity::AtomKey> append_frame(
     slices_to_write.insert(std::end(slices_to_write), std::make_move_iterator(std::begin(slice_and_keys_to_append)), std::make_move_iterator(std::end(slice_and_keys_to_append)));
     std::sort(std::begin(slices_to_write), std::end(slices_to_write));
     if(dynamic_schema) {
-        auto merged_descriptor =
-            merge_descriptors(frame.desc, std::vector<std::shared_ptr<FieldCollection>>{ index_segment_reader.tsd().fields_ptr()}, {});
+        auto merged_descriptor = merge_descriptors(
+            frame.desc,
+            std::vector<std::shared_ptr<FieldCollection>>{index_segment_reader.tsd().fields_ptr()},
+            {}
+        );
         merged_descriptor.set_sorted(deduce_sorted(index_segment_reader.get_sorted(), frame.desc.get_sorted()));
         auto tsd = make_timeseries_descriptor(frame.num_rows + frame.offset, std::move(merged_descriptor), std::move(frame.norm_meta), std::move(frame.user_meta), std::nullopt, std::nullopt, frame.bucketize_dynamic);
         return index::write_index(stream::index_type_from_descriptor(frame.desc), std::move(tsd),
std::move(slices_to_write), key, store); } else { + const FieldCollection& new_fields{index_segment_reader.tsd().fields()}; + for (int i = 0; i < new_fields.size(); ++i) { + const Field& new_field = new_fields.at(i); + TypeDescriptor& original_type = frame.desc.mutable_field(i).mutable_type(); + if (is_empty_type(original_type.data_type()) && !is_empty_type(new_field.type().data_type())) { + original_type = new_field.type(); + } + } frame.desc.set_sorted(deduce_sorted(index_segment_reader.get_sorted(), frame.desc.get_sorted())); return index::write_index(std::move(frame), std::move(slices_to_write), key, store); } diff --git a/cpp/arcticdb/processing/aggregation.cpp b/cpp/arcticdb/processing/aggregation.cpp index 11d8b61aba..14368b8edf 100644 --- a/cpp/arcticdb/processing/aggregation.cpp +++ b/cpp/arcticdb/processing/aggregation.cpp @@ -142,11 +142,6 @@ namespace struct OutputType, void> { using type = ScalarTagType>; }; - - template<> - struct OutputType, void> { - using type = ScalarTagType>; - }; } /********************** diff --git a/cpp/arcticdb/python/python_handlers.cpp b/cpp/arcticdb/python/python_handlers.cpp index b549083444..2f8d343c3a 100644 --- a/cpp/arcticdb/python/python_handlers.cpp +++ b/cpp/arcticdb/python/python_handlers.cpp @@ -61,37 +61,42 @@ namespace arcticdb { const uint8_t*& input, uint8_t* dest, const VariantField& variant_field, - const entity::TypeDescriptor& source_type_descriptor, - const entity::TypeDescriptor& destination_type_descriptor, size_t dest_bytes, std::shared_ptr, - EncodingVersion + EncodingVersion, + const ColumnMapping& m ) { ARCTICDB_SAMPLE(HandleEmpty, 0) util::check(dest != nullptr, "Got null destination pointer"); - const size_t num_rows = dest_bytes / destination_type_descriptor.get_type_byte_size(); + ARCTICDB_TRACE( + log::version(), + "Empty type handler invoked for source type: {}, destination type: {}, num rows: {}", + m.source_type_desc_, + m.dest_type_desc_, + m.num_rows_ + ); static_assert(get_type_size(DataType::EMPTYVAL) == sizeof(PyObject*)); // TODO: arcticdb::util::default_initialize does mostly the same thing // the main difference is that it does not write py::none directly // * Should use that? // * It doesn't handle nullable bools and arrays? How should they be handled? 
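+        // Review aside (summarising the visit_tag dispatch below; the code is authoritative):
+        //     sequence types          -> not_a_string() placeholder, patched to real strings later
+        //     other nullable types    -> Py_None
+        //     floating point          -> quiet NaN
+        //     integer and plain bool  -> zero-filled
+        //     time types              -> NaT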
- destination_type_descriptor.visit_tag([&](auto tag) { + m.dest_type_desc_.visit_tag([&](auto tag) { using RawType = typename decltype(tag)::DataTypeTag::raw_type; RawType* ptr_dest = reinterpret_cast(dest); if constexpr (is_sequence_type(tag.data_type())) { // not_a_string() is set here as strings are offset in the string pool // later in fix_and_reduce() the offsets are replaced by a real python strings // TODO: It would be best if the type handler sets this to py::none - std::fill_n(ptr_dest, num_rows, arcticdb::not_a_string()); + std::fill_n(ptr_dest, m.num_rows_, arcticdb::not_a_string()); } else if constexpr (is_nullable_type(TypeDescriptor(tag))) { - fill_with_none(reinterpret_cast(dest), num_rows); + fill_with_none(reinterpret_cast(dest), m.num_rows_); } else if constexpr (is_floating_point_type(tag.data_type())) { - std::fill_n(ptr_dest, num_rows, std::numeric_limits::quiet_NaN()); + std::fill_n(ptr_dest, m.num_rows_, std::numeric_limits::quiet_NaN()); } else if constexpr (is_integer_type(tag.data_type()) || is_bool_type(tag.data_type())) { - std::memset(dest, 0, sizeof(RawType) * num_rows); + std::memset(dest, 0, sizeof(RawType) * m.num_rows_); } else if constexpr (is_time_type(tag.data_type())) { - std::fill_n(ptr_dest, num_rows, arcticdb::util::NaT); + std::fill_n(ptr_dest, m.num_rows_, arcticdb::util::NaT); } else { static_assert(sizeof(tag) == 0, "Unhandled data_type"); } @@ -116,27 +121,29 @@ namespace arcticdb { }); } + int EmptyHandler::type_size() const { + return sizeof(PyObject*); + } + void BoolHandler::handle_type( - const uint8_t*& data, - uint8_t* dest, - const VariantField& encoded_field_info, - const entity::TypeDescriptor& source_type_descriptor, - const entity::TypeDescriptor& destination_type_descriptor, - size_t dest_bytes, - std::shared_ptr, - EncodingVersion encding_version + const uint8_t*& data, + uint8_t* dest, + const VariantField& encoded_field_info, + size_t dest_bytes, + std::shared_ptr, + EncodingVersion encding_version, + const ColumnMapping& m ) { std::visit([&](const auto& field){ ARCTICDB_SAMPLE(HandleBool, 0) util::check(dest != nullptr, "Got null destination pointer"); util::check(field->has_ndarray(), "Bool handler expected array"); ARCTICDB_DEBUG(log::version(), "Bool handler got encoded field: {}", field->DebugString()); - size_t row_count = dest_bytes / get_type_size(DataType::PYBOOL64); auto ptr_dest = reinterpret_cast(dest); if(!field->ndarray().sparse_map_bytes()) { ARCTICDB_DEBUG(log::version(), "Bool handler has no values"); - fill_with_none(ptr_dest, row_count); + fill_with_none(ptr_dest, m.num_rows_); return; } @@ -145,7 +152,7 @@ namespace arcticdb { const auto bytes = encoding_sizes::data_uncompressed_size(ndarray); auto sparse = ChunkedBuffer::presized(bytes); SliceDataSink sparse_sink{sparse.data(), bytes}; - data += decode_field(source_type_descriptor, *field, data, sparse_sink, bv, encding_version); + data += decode_field(m.source_type_desc_, *field, data, sparse_sink, bv, encding_version); const auto num_bools = bv->count(); auto ptr_src = sparse.template ptr_cast(0, num_bools * sizeof(uint8_t)); auto last_row = 0u; @@ -160,22 +167,25 @@ namespace arcticdb { ++last_row; util::check(ptr_begin + last_row == ptr_dest, "Boolean pointer out of alignment {} != {}", last_row, uintptr_t(ptr_begin + last_row)); } - fill_with_none(ptr_dest, row_count - last_row); + fill_with_none(ptr_dest, m.num_rows_ - last_row); util::check(ptr_begin + last_row == ptr_dest, "Boolean pointer out of alignment at end: {} != {}", last_row, 
uintptr_t(ptr_begin + last_row)); }, encoded_field_info); } + int BoolHandler::type_size() const { + return sizeof(PyObject*); + } + std::mutex ArrayHandler::initialize_array_mutex; void ArrayHandler::handle_type( const uint8_t*& data, uint8_t* dest, const VariantField& encoded_field_info, - const entity::TypeDescriptor& source_type_descriptor, - const entity::TypeDescriptor& destination_type_descriptor, size_t dest_bytes, std::shared_ptr buffers, - EncodingVersion encoding_version + EncodingVersion encoding_version, + const ColumnMapping& m ) { util::variant_match(encoded_field_info, [&](auto field){ ARCTICDB_SAMPLE(HandleArray, 0) @@ -188,17 +198,17 @@ namespace arcticdb { fill_with_none(ptr_dest, row_count); return; } - std::shared_ptr column = buffers->get_buffer(source_type_descriptor, true); + std::shared_ptr column = buffers->get_buffer(m.source_type_desc_, true); column->check_magic(); log::version().info("Column got buffer at {}", uintptr_t(column.get())); auto bv = std::make_optional(util::BitSet{}); - data += decode_field(source_type_descriptor, *field, data, *column, bv, encoding_version); + data += decode_field(m.source_type_desc_, *field, data, *column, bv, encoding_version); auto last_row = 0u; ARCTICDB_SUBSAMPLE(InitArrayAcquireGIL, 0) - const auto strides = static_cast(get_type_size(source_type_descriptor.data_type())); - const py::dtype py_dtype = generate_python_dtype(source_type_descriptor, strides); - source_type_descriptor.visit_tag([&] (auto tdt) { + const auto strides = static_cast(get_type_size(m.source_type_desc_.data_type())); + const py::dtype py_dtype = generate_python_dtype(m.source_type_desc_, strides); + m.source_type_desc_.visit_tag([&] (auto tdt) { const auto& blocks = column->blocks(); if(blocks.empty()) return; @@ -236,4 +246,8 @@ namespace arcticdb { fill_with_none(ptr_dest, row_count - last_row); }); } + + int ArrayHandler::type_size() const { + return sizeof(PyObject*); + } } diff --git a/cpp/arcticdb/python/python_handlers.hpp b/cpp/arcticdb/python/python_handlers.hpp index 4c501bee5c..ac073ac69d 100644 --- a/cpp/arcticdb/python/python_handlers.hpp +++ b/cpp/arcticdb/python/python_handlers.hpp @@ -10,18 +10,21 @@ #include namespace arcticdb { - namespace py = pybind11; + struct ColumnMapping; + struct EmptyHandler { /// @see arcticdb::ITypeHandler void handle_type( const uint8_t*& data, uint8_t* dest, const VariantField& encoded_field, - const entity::TypeDescriptor& source_type_descriptor, - const entity::TypeDescriptor& destination_type_descriptor, size_t dest_bytes, std::shared_ptr buffers, - EncodingVersion encding_version); + EncodingVersion encding_version, + const ColumnMapping& columnMapping + ); + + int type_size() const; }; struct BoolHandler { @@ -30,26 +33,25 @@ namespace arcticdb { const uint8_t *&data, uint8_t *dest, const VariantField &encoded_field, - const entity::TypeDescriptor& source_type_descriptor, - const entity::TypeDescriptor& destination_type_descriptor, size_t dest_bytes, std::shared_ptr buffers, - EncodingVersion encding_version); + EncodingVersion encding_version, + const ColumnMapping& columnMapping + ); + int type_size() const; }; struct DecimalHandler { void handle_type( - const uint8_t*& data, - uint8_t* dest, - const VariantField& encoded_field, - const entity::TypeDescriptor& source_type_descriptor, - const entity::TypeDescriptor& destination_type_descriptor, - size_t dest_bytes, - std::shared_ptr buffers + const uint8_t*& data, + uint8_t* dest, + const VariantField& encoded_field, + size_t dest_bytes, + 
std::shared_ptr buffers, + EncodingVersion encding_version, + const ColumnMapping& m ); - - std::shared_ptr mutex_ = std::make_shared(); - std::shared_ptr decimal_ = std::make_shared(py::module_::import("decimal").attr("Decimal")); + int type_size() const; }; struct ArrayHandler { @@ -58,11 +60,12 @@ namespace arcticdb { const uint8_t*& data, uint8_t* dest, const VariantField& encoded_field, - const entity::TypeDescriptor& source_type_descriptor, - const entity::TypeDescriptor& destination_type_descriptor, size_t dest_bytes, std::shared_ptr buffers, - EncodingVersion encding_version); + EncodingVersion encding_version, + const ColumnMapping& columnMapping + ); + int type_size() const; static std::mutex initialize_array_mutex; }; } //namespace arcticdb diff --git a/cpp/arcticdb/python/python_module.cpp b/cpp/arcticdb/python/python_module.cpp index cd6c554a1c..cc061dc5fe 100644 --- a/cpp/arcticdb/python/python_module.cpp +++ b/cpp/arcticdb/python/python_module.cpp @@ -291,7 +291,6 @@ void register_type_handlers() { for(const DataType& data_type : allowed_array_types) { TypeHandlerRegistry::instance()->register_handler(TypeDescriptor{data_type, Dimension::Dim1}, arcticdb::ArrayHandler()); } - TypeHandlerRegistry::instance()->register_handler(TypeDescriptor{DataType::PYBOOL64, Dimension::Dim0}, arcticdb::BoolHandler()); TypeHandlerRegistry::instance()->register_handler(TypeDescriptor{DataType::PYBOOL8, Dimension::Dim0}, arcticdb::BoolHandler()); } diff --git a/cpp/arcticdb/stream/index.hpp b/cpp/arcticdb/stream/index.hpp index 370e83d67e..006b695f67 100644 --- a/cpp/arcticdb/stream/index.hpp +++ b/cpp/arcticdb/stream/index.hpp @@ -82,7 +82,7 @@ class TimeseriesIndex : public BaseIndex { void check(const FieldCollection &fields) const { const size_t fields_size = fields.size(); - const int current_fields_size = int(field_count()); + constexpr int current_fields_size = int(field_count()); const TypeDescriptor &first_field_type = fields[0].type(); const TypeDescriptor ¤t_first_field_type = this->field(0).type(); diff --git a/cpp/arcticdb/util/sparse_utils.hpp b/cpp/arcticdb/util/sparse_utils.hpp index 6eb892c1b3..5b17bcfe5a 100644 --- a/cpp/arcticdb/util/sparse_utils.hpp +++ b/cpp/arcticdb/util/sparse_utils.hpp @@ -75,19 +75,16 @@ template void default_initialize(uint8_t* data, size_t bytes) { using RawType = typename TagType::DataTypeTag::raw_type; const auto num_rows ARCTICDB_UNUSED = bytes / sizeof(RawType); - - constexpr auto data_type =TagType::DataTypeTag::data_type; + constexpr auto data_type = TagType::DataTypeTag::data_type; auto type_ptr ARCTICDB_UNUSED = reinterpret_cast(data); if constexpr (is_sequence_type(data_type)) { - std::fill(type_ptr, type_ptr + num_rows, not_a_string()); - } - else if constexpr (is_floating_point_type(data_type)) { - std::fill(type_ptr, type_ptr + num_rows, std::numeric_limits::quiet_NaN()); - } - else if constexpr (is_time_type(data_type)) { - std::fill(type_ptr, type_ptr + num_rows, NaT); + std::fill_n(type_ptr, num_rows, not_a_string()); + } else if constexpr (is_floating_point_type(data_type)) { + std::fill_n(type_ptr, num_rows, std::numeric_limits::quiet_NaN()); + } else if constexpr (is_time_type(data_type)) { + std::fill_n(type_ptr, num_rows, NaT); } else { - std::fill(data, data + bytes, 0); + std::fill_n(data, bytes, 0); } } diff --git a/cpp/arcticdb/util/type_handler.hpp b/cpp/arcticdb/util/type_handler.hpp index aa21f0e14f..8a3d167bcd 100644 --- a/cpp/arcticdb/util/type_handler.hpp +++ b/cpp/arcticdb/util/type_handler.hpp @@ -11,6 +11,7 @@ #include 
#include #include +#include #include @@ -32,28 +33,30 @@ struct ITypeHandler { const uint8_t*& source, uint8_t* dest, const VariantField& encoded_field_info, - const entity::TypeDescriptor& source_type_descriptor, - const entity::TypeDescriptor& destination_type_descriptor, size_t dest_bytes, std::shared_ptr buffers, - EncodingVersion encoding_version + EncodingVersion encoding_version, + const ColumnMapping& m ) { folly::poly_call<0>( *this, source, dest, encoded_field_info, - source_type_descriptor, - destination_type_descriptor, dest_bytes, buffers, - encoding_version + encoding_version, + m ); } + + int type_size() const { + return folly::poly_call<1>(*this); + } }; template - using Members = folly::PolyMembers<&T::handle_type>; + using Members = folly::PolyMembers<&T::handle_type, &T::type_size>; }; using TypeHandler = folly::Poly; diff --git a/python/tests/conftest.py b/python/tests/conftest.py index fd6d2b1e4a..f925ccd1d1 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -403,14 +403,15 @@ def lmdb_version_store_v2(version_store_factory, lib_name): return version_store_factory(dynamic_strings=True, encoding_version=int(EncodingVersion.V2), name=library_name) -@pytest.fixture -def lmdb_version_store(lmdb_version_store_v1, lmdb_version_store_v2, encoding_version): - if encoding_version == EncodingVersion.V1: - return lmdb_version_store_v1 - elif encoding_version == EncodingVersion.V2: - return lmdb_version_store_v2 - else: - raise ValueError(f"Unexpected encoding version: {encoding_version}") +@pytest.fixture( + scope="function", + params=( + "lmdb_version_store_v1", + "lmdb_version_store_v2", + ) +) +def lmdb_version_store(request): + yield request.getfixturevalue(request.param) @pytest.fixture @@ -657,6 +658,22 @@ def get_df(ts, width, max_col_width): return get_df +@pytest.fixture( + scope="function", + params=( + "lmdb_version_store_v1", + "lmdb_version_store_v2", + "lmdb_version_store_dynamic_schema_v1", + "lmdb_version_store_dynamic_schema_v2", + ), +) +def lmdb_version_store_static_and_dynamic(request): + """ + Designed to test all combinations between schema and encoding version for LMDB + """ + yield request.getfixturevalue(request.param) + + @pytest.fixture( scope="function", params=( diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index bd9b03e45a..9cccbea962 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -22,7 +22,7 @@ def float_dtype(request): # object is for nullable boolean -@pytest.fixture(params=["bool", "object"]) +@pytest.fixture(params=["object", "bool"]) def boolean_dtype(request): yield request.param @@ -30,57 +30,201 @@ def boolean_dtype(request): class TestCanAppendToEmptyColumn: @pytest.fixture(autouse=True) - def create_empty_column(self, lmdb_version_store): - df_empty = pd.DataFrame({"col1": [None, None]}) - lmdb_version_store.write("sym", df_empty) + def create_empty_column(self, lmdb_version_store_static_and_dynamic): + lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col1": [None, None]})) yield - def test_integer(self, lmdb_version_store, int_dtype): + def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype): df_non_empty = pd.DataFrame({"col1": np.array([1,2,3], dtype=int_dtype)}) - lmdb_version_store.append("sym", df_non_empty) + lmdb_version_store_static_and_dynamic.append("sym", 
df_non_empty) expected_result = pd.DataFrame({"col1": np.array([0,0,1,2,3], dtype=int_dtype)}) - assert_frame_equal(lmdb_version_store.read("sym").data, expected_result) - - def test_float(self, lmdb_version_store, float_dtype): + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, expected_result) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,2]).data, + pd.DataFrame({"col1": np.array([0,0], dtype=int_dtype)}) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[2,5]).data, + df_non_empty + ) + + def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype): df_non_empty = pd.DataFrame({"col1": np.array([1,2,3], dtype=float_dtype)}) - lmdb_version_store.append("sym", df_non_empty) + lmdb_version_store_static_and_dynamic.append("sym", df_non_empty) expected_result = pd.DataFrame({"col1": np.array([float("NaN"),float("NaN"),1,2,3], dtype=float_dtype)}) - assert_frame_equal(lmdb_version_store.read("sym").data, expected_result) - - def test_bool(self, lmdb_version_store, boolean_dtype): + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, expected_result) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,2]).data, + pd.DataFrame({"col1": np.array([float("NaN"),float("NaN")], dtype=float_dtype)}) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[2,5]).data, + df_non_empty + ) + + def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype): df_non_empty = pd.DataFrame({"col1": np.array([True, False, True], dtype=boolean_dtype)}) - lmdb_version_store.append("sym", df_non_empty) + lmdb_version_store_static_and_dynamic.append("sym", df_non_empty) expected_result = pd.DataFrame({"col1": np.array([None, None, True, False, True], dtype=boolean_dtype)}) - assert_frame_equal(lmdb_version_store.read("sym").data, expected_result) - - def test_empty(self, lmdb_version_store): + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, expected_result) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,2]).data, + pd.DataFrame({"col1": np.array([None, None], dtype=boolean_dtype)}) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[2,5]).data, + df_non_empty + ) + + def test_empty(self, lmdb_version_store_static_and_dynamic): df_non_empty = pd.DataFrame({"col1": np.array([None, None, None])}) - lmdb_version_store.append("sym", df_non_empty) + lmdb_version_store_static_and_dynamic.append("sym", df_non_empty) expected_result = pd.DataFrame({"col1": np.array([None, None, None, None, None])}) - assert_frame_equal(lmdb_version_store.read("sym").data, expected_result) - - def test_string(self, lmdb_version_store): + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, expected_result) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,2]).data, + pd.DataFrame({"col1": np.array([None, None])}) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[2,5]).data, + df_non_empty + ) + + def test_string(self, lmdb_version_store_static_and_dynamic): df_non_empty = pd.DataFrame({"col1": np.array(["some_string", "long_string"*100])}) - lmdb_version_store.append("sym", df_non_empty) + lmdb_version_store_static_and_dynamic.append("sym", df_non_empty) expected_result = pd.DataFrame({"col1": np.array([None, None, 
"some_string", "long_string"*100])}) - assert_frame_equal(lmdb_version_store.read("sym").data, expected_result) - - def test_date(self, lmdb_version_store): - df_non_empty = pd.DataFrame({"col1": np.array([np.datetime64('2005-02'), np.datetime64('2005-03'), np.datetime64('2005-03')])}) - lmdb_version_store.append("sym", df_non_empty) + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, expected_result) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,2]).data, + pd.DataFrame({"col1": np.array([None, None])}) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[2,5]).data, + df_non_empty + ) + + def test_date(self, lmdb_version_store_static_and_dynamic): + df_non_empty = pd.DataFrame({"col1": np.array([np.datetime64('2005-02'), np.datetime64('2005-03'), np.datetime64('2005-03')])}, dtype="datetime64[ns]") + lmdb_version_store_static_and_dynamic.append("sym", df_non_empty) expected_result = pd.DataFrame({"col1": np.array([np.datetime64('NaT'), np.datetime64('NaT'), np.datetime64('2005-02'), np.datetime64('2005-03'), np.datetime64('2005-03')], dtype="datetime64[ns]")}) - assert_frame_equal(lmdb_version_store.read("sym").data, expected_result) + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, expected_result) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,2]).data, + pd.DataFrame({"col1": np.array([np.datetime64('NaT'), np.datetime64('NaT')], dtype="datetime64[ns]")}) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[2,5]).data, + df_non_empty + ) class TestCanAppendEmptyToColumn: - def test_integer(self, lmdb_version_store_v2): - int_dtype = "int32" - lmdb_version_store = lmdb_version_store_v2 - df = pd.DataFrame({"col1": np.array([1,2,3], dtype=int_dtype)}) - lmdb_version_store.write("sym", df) - lmdb_version_store.append("sym", pd.DataFrame({"col1": [None, None]})) - lmdb_version_store.append("sym", pd.DataFrame({"col1": [None, None]})) - expected_result = pd.DataFrame({"col1": np.array([1,2,3,0,0,0,0], dtype=int_dtype)}) - print(lmdb_version_store.read("sym", row_range=[5,7]).data) - assert_frame_equal(lmdb_version_store.read("sym").data, expected_result) \ No newline at end of file + def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype): + df_initial = pd.DataFrame({"col1": np.array([1,2,3], dtype=int_dtype)}) + lmdb_version_store_static_and_dynamic.write("sym", df_initial) + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col1": [None, None]})) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame({"col1": np.array([1,2,3,0,0], dtype=int_dtype)}) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,3]).data, + df_initial + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[3,5]).data, + pd.DataFrame({"col1": np.array([0,0], dtype=int_dtype)}) + ) + + def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype): + df_initial = pd.DataFrame({"col1": np.array([1,2,3], dtype=float_dtype)}) + lmdb_version_store_static_and_dynamic.write("sym", df_initial) + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col1": [None, None]})) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame({"col1": np.array([1,2,3,float("NaN"),float("NaN")], dtype=float_dtype)}) + ) + 
assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,3]).data, + df_initial + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[3,5]).data, + pd.DataFrame({"col1": np.array([float("NaN"), float("NaN")], dtype=float_dtype)}) + ) + + def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype): + df_initial = pd.DataFrame({"col1": np.array([True, False, True], dtype=boolean_dtype)}) + df_with_none = pd.DataFrame({"col1": np.array([None, None])}) + lmdb_version_store_static_and_dynamic.write("sym", df_initial) + lmdb_version_store_static_and_dynamic.append("sym", df_with_none) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame({"col1": np.array([True, False, True, None, None], dtype=boolean_dtype)}) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,3]).data, + df_initial + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[3,5]).data, + pd.DataFrame(df_with_none, dtype=boolean_dtype) + ) + + def test_string(self, lmdb_version_store_static_and_dynamic): + df_initial = pd.DataFrame({"col1": np.array(["some_string", "long_string"*100, ""])}) + df_with_none = pd.DataFrame({"col1": np.array([None, None])}) + lmdb_version_store_static_and_dynamic.write("sym", df_initial) + lmdb_version_store_static_and_dynamic.append("sym", df_with_none) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame({"col1": np.array(["some_string", "long_string"*100, "", None, None])}) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,3]).data, + df_initial + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[3,5]).data, + df_with_none + ) + + def test_date(self, lmdb_version_store_static_and_dynamic): + df_initial = pd.DataFrame( + { + "col1": np.array( + [ + np.datetime64('2005-02'), + np.datetime64('2005-03'), + np.datetime64('2005-03') + ] + ) + }, + dtype="datetime64[ns]" + ) + lmdb_version_store_static_and_dynamic.write("sym", df_initial) + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col1": np.array([None, None])})) + expected_result = pd.DataFrame( + { + "col1": np.array( + [ + np.datetime64('2005-02'), + np.datetime64('2005-03'), + np.datetime64('2005-03'), + np.datetime64('NaT'), + np.datetime64('NaT') + ], + dtype="datetime64[ns]") + } + ) + assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, expected_result) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,3]).data, + df_initial + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym", row_range=[3,5]).data, + pd.DataFrame({"col1": np.array([np.datetime64('NaT'), np.datetime64('NaT')], dtype="datetime64[ns]")}) + ) \ No newline at end of file From 840d8993cd2ee4eee353c5637c1472cb3f8c8466 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Thu, 25 Jan 2024 10:38:27 +0200 Subject: [PATCH 05/30] Use boost span for flatten_and_fix_rows --- cpp/arcticdb/pipeline/write_frame.cpp | 20 +++++++++++--------- cpp/arcticdb/pipeline/write_frame.hpp | 10 ++++------ cpp/arcticdb/version/version_core.cpp | 14 ++++++-------- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/cpp/arcticdb/pipeline/write_frame.cpp b/cpp/arcticdb/pipeline/write_frame.cpp index a3dfe27197..b49caae4b7 100644 --- a/cpp/arcticdb/pipeline/write_frame.cpp +++ 
b/cpp/arcticdb/pipeline/write_frame.cpp @@ -28,8 +28,6 @@ #include #include -#include - namespace arcticdb::pipelines { using namespace arcticdb::entity; @@ -299,19 +297,23 @@ std::optional rewrite_partial_segment( } } -std::vector flatten_and_fix_rows(const std::vector>& groups, size_t& global_count) { +std::vector flatten_and_fix_rows(boost::span> groups, size_t& global_count) { std::vector output; global_count = 0; - for(auto group : groups) { - if(group.empty()) continue; + for (const std::vector& group : groups) { + if (group.empty()) + continue; auto group_start = group.begin()->slice_.row_range.first; - auto group_end = std::accumulate(std::begin(group), std::end(group), 0ULL, [](size_t a, const SliceAndKey& sk) { return std::max(a, sk.slice_.row_range.second); }); - std::transform(std::begin(group), std::end(group), std::back_inserter(output), [&] (auto& sk) { + auto group_end = std::accumulate(std::begin(group), std::end(group), 0ULL, [](size_t a, const SliceAndKey& sk) { + return std::max(a, sk.slice_.row_range.second); + }); + std::transform(std::begin(group), std::end(group), std::back_inserter(output), [&](SliceAndKey sk) { auto range_start = global_count + (sk.slice_.row_range.first - group_start); auto new_range = RowRange{range_start, range_start + (sk.slice_.row_range.diff())}; sk.slice_.row_range = new_range; - return sk; }); - global_count += (group_end - group_start) ; + return sk; + }); + global_count += (group_end - group_start); } return output; } diff --git a/cpp/arcticdb/pipeline/write_frame.hpp b/cpp/arcticdb/pipeline/write_frame.hpp index 386a8181d4..52e4b69b6b 100644 --- a/cpp/arcticdb/pipeline/write_frame.hpp +++ b/cpp/arcticdb/pipeline/write_frame.hpp @@ -7,12 +7,9 @@ #pragma once -#include -#include #include #include -#include #include #include #include @@ -22,6 +19,8 @@ #include #include +#include + namespace arcticdb::pipelines { using namespace arcticdb::stream; @@ -70,8 +69,7 @@ std::optional rewrite_partial_segment( bool before, const std::shared_ptr& store); -std::vector flatten_and_fix_rows( - const std::vector>& groups, - size_t& global_count); +// TODO: Use std::span when C++20 is enabled +std::vector flatten_and_fix_rows(boost::span> groups, size_t& global_count); } //namespace arcticdb::pipelines \ No newline at end of file diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index faddb9735f..fb304c9a30 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -264,13 +264,12 @@ VersionedItem delete_range_impl( auto orig_filter_range = std::holds_alternative(query.row_filter) ? 
get_query_index_range(index, index_range) : query.row_filter;

     size_t row_count = 0;
-    auto flattened_slice_and_keys = flatten_and_fix_rows(std::vector<std::vector<SliceAndKey>>{
+    const std::array<std::vector<SliceAndKey>, 4> groups{
         strictly_before(orig_filter_range, unaffected_keys),
         std::move(intersect_before),
         std::move(intersect_after),
-        strictly_after(orig_filter_range, unaffected_keys)},
-        row_count
-    );
+        strictly_after(orig_filter_range, unaffected_keys)};
+    auto flattened_slice_and_keys = flatten_and_fix_rows(boost::span{groups}, row_count);
     std::sort(std::begin(flattened_slice_and_keys), std::end(flattened_slice_and_keys));

     bool bucketize_dynamic = index_segment_reader.bucketize_dynamic();
@@ -367,14 +366,13 @@ VersionedItem update_impl(

     size_t row_count = 0;
     const size_t new_keys_size = new_slice_and_keys.size();
-    auto flattened_slice_and_keys = flatten_and_fix_rows(std::vector<std::vector<SliceAndKey>>{
+    const std::array<std::vector<SliceAndKey>, 5> groups{
         strictly_before(orig_filter_range, unaffected_keys),
         std::move(intersect_before),
         std::move(new_slice_and_keys),
         std::move(intersect_after),
-        strictly_after(orig_filter_range, unaffected_keys)},
-        row_count
-    );
+        strictly_after(orig_filter_range, unaffected_keys)};
+    auto flattened_slice_and_keys = flatten_and_fix_rows(boost::span{groups}, row_count);

     util::check(unaffected_keys.size() + new_keys_size + (affected_keys.size() * 2) >= flattened_slice_and_keys.size(),
         "Output size mismatch: {} + {} + (2 * {}) < {}",

From 1c97adfed36e98248838a0ecbf1b971a8ab6a5af Mon Sep 17 00:00:00 2001
From: Vasil Pashov
Date: Thu, 25 Jan 2024 11:34:04 +0200
Subject: [PATCH 06/30] Use TypeDescriptor constructor instead of static_cast

---
 cpp/arcticdb/entity/merge_descriptors.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/arcticdb/entity/merge_descriptors.cpp b/cpp/arcticdb/entity/merge_descriptors.cpp
index 0be36d5c74..ed8ea636cf 100644
--- a/cpp/arcticdb/entity/merge_descriptors.cpp
+++ b/cpp/arcticdb/entity/merge_descriptors.cpp
@@ -32,11 +32,11 @@ StreamDescriptor merge_descriptors(
       util::variant_match(temp_idx, [&merged_fields, &merged_fields_map] (const auto& idx) {
         using IndexType = std::decay_t<decltype(idx)>;
         merged_fields.emplace_back(idx.name());
-        merged_fields_map.try_emplace(idx.name(), static_cast<TypeDescriptor>(typename IndexType::TypeDescTag{}));
+        merged_fields_map.try_emplace(idx.name(), TypeDescriptor{typename IndexType::TypeDescTag{}});
       });
-    }
-    else
+    } else {
       util::raise_rte("Descriptor has uninitialized index and no default supplied");
+    }
   } else {
     index = index_type_from_descriptor(original);
   }

From 56385851e16ea0d0a7aa4bf6f0a056e82d76e8f7 Mon Sep 17 00:00:00 2001
From: Vasil Pashov
Date: Mon, 29 Jan 2024 18:38:50 +0200
Subject: [PATCH 07/30] Truncate columns on update and fix the pybool type
 handler so that a missing sparse map means all values are present

---
 cpp/arcticdb/column_store/memory_segment.hpp  |   5 +-
 .../column_store/memory_segment_impl.cpp      |  32 +++-
 .../column_store/memory_segment_impl.hpp      |  18 ++-
 cpp/arcticdb/pipeline/filter_segment.hpp      |   8 -
 cpp/arcticdb/pipeline/string_pool_utils.hpp   |  12 +-
 cpp/arcticdb/pipeline/write_frame.cpp         | 149 +++++++++---------
 cpp/arcticdb/pipeline/write_frame.hpp         |   7 +-
 cpp/arcticdb/processing/clause.cpp            |   2 +-
 cpp/arcticdb/processing/processing_unit.cpp   |   2 +-
 cpp/arcticdb/python/python_handlers.cpp       |  53 +++----
 cpp/arcticdb/version/version_core.cpp         |  20 ++-
 .../version_store/test_empty_column_type.py   |  19 ++-
 12 files changed, 193 insertions(+), 134 deletions(-)

diff --git a/cpp/arcticdb/column_store/memory_segment.hpp
b/cpp/arcticdb/column_store/memory_segment.hpp index 0e599955ec..0dc734ffc7 100644 --- a/cpp/arcticdb/column_store/memory_segment.hpp +++ b/cpp/arcticdb/column_store/memory_segment.hpp @@ -398,8 +398,9 @@ class SegmentInMemory { return SegmentInMemory(impl_->filter(filter_bitset, filter_down_stringpool, validate)); } - SegmentInMemory truncate(size_t start_row, size_t end_row) const{ - return SegmentInMemory(impl_->truncate(start_row, end_row)); + /// @see SegmentInMemoryImpl::truncate + SegmentInMemory truncate(size_t start_row, size_t end_row, bool reconstruct_string_pool) const{ + return SegmentInMemory(impl_->truncate(start_row, end_row, reconstruct_string_pool)); } std::vector partition(const std::vector>& row_to_segment, diff --git a/cpp/arcticdb/column_store/memory_segment_impl.cpp b/cpp/arcticdb/column_store/memory_segment_impl.cpp index efd3b17e4f..4be680acdc 100644 --- a/cpp/arcticdb/column_store/memory_segment_impl.cpp +++ b/cpp/arcticdb/column_store/memory_segment_impl.cpp @@ -418,8 +418,11 @@ bool operator==(const SegmentInMemoryImpl& left, const SegmentInMemoryImpl& righ return true; } -// Inclusive of start_row, exclusive of end_row -std::shared_ptr SegmentInMemoryImpl::truncate(size_t start_row, size_t end_row) const { +std::shared_ptr SegmentInMemoryImpl::truncate( + size_t start_row, + size_t end_row, + bool reconstruct_string_pool +) const { auto num_values = end_row - start_row; internal::check( is_sparse() || (start_row < row_count() && end_row <= row_count()), @@ -428,8 +431,10 @@ std::shared_ptr SegmentInMemoryImpl::truncate(size_t start_ auto output = std::make_shared(); output->set_row_data(num_values - 1); - output->set_string_pool(string_pool_); output->set_compacted(compacted_); + if (!reconstruct_string_pool) { + output->set_string_pool(string_pool_); + } if (metadata_) { google::protobuf::Any metadata; metadata.CopyFrom(*metadata_); @@ -437,8 +442,25 @@ std::shared_ptr SegmentInMemoryImpl::truncate(size_t start_ } for(const auto&& [idx, column] : folly::enumerate(columns_)) { - auto truncated_column = Column::truncate(column, start_row, end_row); - output->add_column(descriptor_->field(idx), truncated_column); + const DataType column_type = column->type().data_type(); + const Field& field = descriptor_->field(idx); + if (is_sequence_type(column_type) && reconstruct_string_pool) { + output->add_column(field, num_values, true); + ChunkedBuffer& target = output->column(idx).data().buffer(); + for (size_t row = 0u; row < num_values; ++row) { + util::variant_match( + get_string_from_buffer(row, column->data().buffer(), output->string_pool()), + [&](std::string_view sv) { + OffsetString off_str = output->string_pool().get(sv); + set_offset_string_at(row, target, off_str.offset()); + }, + [&](entity::position_t offset) { set_offset_string_at(row, target, offset); } + ); + } + } else { + auto truncated_column = Column::truncate(column, start_row, end_row); + output->add_column(field, truncated_column); + } } output->attach_descriptor(descriptor_); return output; diff --git a/cpp/arcticdb/column_store/memory_segment_impl.hpp b/cpp/arcticdb/column_store/memory_segment_impl.hpp index 71f07eb3b8..7e8cb743d8 100644 --- a/cpp/arcticdb/column_store/memory_segment_impl.hpp +++ b/cpp/arcticdb/column_store/memory_segment_impl.hpp @@ -782,8 +782,8 @@ class SegmentInMemoryImpl { SegmentInMemoryImpl clone() const; - void set_string_pool(const std::shared_ptr& string_pool) { - string_pool_ = string_pool; + void set_string_pool(std::shared_ptr string_pool) { + string_pool_ = 
std::move(string_pool);
     }

     std::shared_ptr<SegmentInMemoryImpl> get_output_segment(size_t num_values, bool pre_allocate=true) const;

@@ -808,8 +808,18 @@ class SegmentInMemoryImpl {

     void set_timeseries_descriptor(TimeseriesDescriptor&& tsd);

-    // Inclusive of start_row, exclusive of end_row
-    std::shared_ptr<SegmentInMemoryImpl> truncate(size_t start_row, size_t end_row) const;
+    /// @brief Construct a copy of the segment containing only rows in [start_row; end_row)
+    /// @param start_row Start of the row range (inclusive)
+    /// @param end_row End of the row range (exclusive)
+    /// @param reconstruct_string_pool When truncating, some of the string values of the original
+    /// segment might not be referenced in the resulting segment. In this case, reconstructing the
+    /// string pool will save some memory. Note that reconstructing the string pool is an expensive
+    /// operation and should be avoided if possible.
+    std::shared_ptr<SegmentInMemoryImpl> truncate(
+        size_t start_row,
+        size_t end_row,
+        bool reconstruct_string_pool
+    ) const;

     // Partitions the segment into n new segments. Each row in the starting segment is mapped to one of the output segments
     // by the row_to_segment vector (std::nullopt means the row is not included in any output segment).
diff --git a/cpp/arcticdb/pipeline/filter_segment.hpp b/cpp/arcticdb/pipeline/filter_segment.hpp
index 1e6699a3b7..142d132bd0 100644
--- a/cpp/arcticdb/pipeline/filter_segment.hpp
+++ b/cpp/arcticdb/pipeline/filter_segment.hpp
@@ -9,8 +9,6 @@
 #include
 #include
-#include
-#include

 namespace arcticdb {

@@ -21,12 +19,6 @@ inline SegmentInMemory filter_segment(const SegmentInMemory& input,
     return input.filter(filter_bitset, filter_down_stringpool, validate);
 }

-inline SegmentInMemory truncate_segment(const SegmentInMemory& input,
-                                        size_t start,
-                                        size_t end) {
-    return input.truncate(start, end);
-}
-
 inline std::vector<SegmentInMemory> partition_segment(const SegmentInMemory& input,
         const std::vector<std::optional<uint8_t>>& row_to_segment,
         const std::vector<uint64_t>& segment_counts) {
diff --git a/cpp/arcticdb/pipeline/string_pool_utils.hpp b/cpp/arcticdb/pipeline/string_pool_utils.hpp
index 3a1212bc18..fac34f7894 100644
--- a/cpp/arcticdb/pipeline/string_pool_utils.hpp
+++ b/cpp/arcticdb/pipeline/string_pool_utils.hpp
@@ -36,8 +36,16 @@ inline auto get_string_from_pool(entity::position_t offset_val, const StringPool
     return string_pool.get_const_view(offset_val);
 }

-inline std::variant<std::string_view, entity::position_t> get_string_from_buffer(size_t offset, const ChunkedBuffer& src, const StringPool& string_pool) {
-    auto offset_val = get_offset_string_at(offset, src);
+/// @brief Get the i-th string from a buffer
+/// @param string_pos The index of the string to be returned
+/// @return If the string at @p string_pos is an actual string inside @p string_pool return a
+/// string view, otherwise return an (integer) placeholder representing not a string.
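+///
+/// A minimal usage sketch (hypothetical `column_buffer`/`pool` variables, for illustration only):
+///     util::variant_match(
+///         get_string_from_buffer(row, column_buffer, pool),
+///         [](std::string_view sv) { /* a real string view into the pool */ },
+///         [](entity::position_t placeholder) { /* not_a_string() or nan_placeholder() */ });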
+inline std::variant get_string_from_buffer( + size_t string_pos, + const ChunkedBuffer& src, + const StringPool& string_pool +) { + auto offset_val = get_offset_string_at(string_pos, src); if (offset_val == nan_placeholder() || offset_val == not_a_string()) return offset_val; else diff --git a/cpp/arcticdb/pipeline/write_frame.cpp b/cpp/arcticdb/pipeline/write_frame.cpp index b49caae4b7..1afc384227 100644 --- a/cpp/arcticdb/pipeline/write_frame.cpp +++ b/cpp/arcticdb/pipeline/write_frame.cpp @@ -215,86 +215,87 @@ folly::Future append_frame( }); } -void update_string_columns(const SegmentInMemory& original, SegmentInMemory output) { - util::check(original.descriptor() == output.descriptor(), "Update string column handling expects identical descriptors"); - for (size_t column = 0; column < static_cast(original.descriptor().fields().size()); ++column) { - auto &frame_field = original.field(column); - auto field_type = frame_field.type().data_type(); - - if (is_sequence_type(field_type)) { - auto &target = output.column(static_cast(column)).data().buffer(); - size_t end = output.row_count(); - for(auto row = 0u; row < end; ++row) { - auto val = get_string_from_buffer(row, target, original.const_string_pool()); - util::variant_match(val, - [&] (std::string_view sv) { - auto off_str = output.string_pool().get(sv); - set_offset_string_at(row, target, off_str.offset()); - }, - [&] (entity::position_t offset) { - set_offset_string_at(row, target, offset); - }); +/// @brief Find the row range of affected rows during a partial rewrite (on update) +/// During partial rewrite the segment is either affected from the beginning to a +/// certain row or from a certain row to the end. Thus the row range will always be +/// either [0, row) or [row, end). +static RowRange partial_rewrite_row_range( + const SegmentInMemory& segment, + const IndexRange& range, + AffectedSegmentEnd affected_end +) { + if (affected_end == AffectedSegmentEnd::START) { + const timestamp start = std::get(range.start_); + auto bound = std::lower_bound(std::begin(segment), std::end(segment), start, [](const auto& row, timestamp t) { + return row.template index() < t; + }); + return {0, bound->row_pos()}; + } else { + const timestamp end = std::get(range.end_); + auto bound = std::upper_bound(std::begin(segment), std::end(segment), end, [](timestamp t, auto& row) { + return t < row.template index(); + }); + return {bound->row_pos(), segment.row_count()}; + } +} - } - } +/// @brief Find the index range of affected rows during a partial rewrite (on update) +/// Similar to partial_rewrite_row_range the segment is affected either at the beginning +/// or at the end. 
+static IndexRange partial_rewrite_index_range(
+    const IndexRange& segment_range,
+    const IndexRange& update_range,
+    AffectedSegmentEnd affected_end
+) {
+    if (affected_end == AffectedSegmentEnd::START) {
+        util::check(
+            segment_range.start_ < update_range.start_,
+            "Unexpected index range in after: {} !< {}",
+            segment_range.start_,
+            update_range.start_
+        );
+        return {segment_range.start_, update_range.start_};
+    } else {
+        util::check(
+            segment_range.end_ > update_range.end_,
+            "Unexpected non-intersection of update indices: {} !> {}",
+            segment_range.end_,
+            update_range.end_
+        );
+        return {update_range.end_, segment_range.end_};
+    }
+}

 std::optional<SliceAndKey> rewrite_partial_segment(
-    const SliceAndKey& existing,
-    IndexRange index_range,
-    VersionId version_id,
-    bool before,
-    const std::shared_ptr<Store>& store) {
-    const auto& key = existing.key();
-    const auto& existing_range =key.index_range();
+    const SliceAndKey& existing,
+    IndexRange index_range,
+    VersionId version_id,
+    AffectedSegmentEnd affected_end,
+    const std::shared_ptr<Store>& store
+) {
+    const auto& key = existing.key();
+    const auto& existing_range = key.index_range();
     auto kv = store->read(key).get();
-    auto &segment = kv.second;
-
-    auto start = std::get<timestamp>(index_range.start_);
-    auto end = std::get<timestamp>(index_range.end_);
-
-    if(!before) {
-        util::check(existing_range.start_ < index_range.start_, "Unexpected index range in after: {} !< {}", existing_range.start_, index_range.start_);
-        auto bound = std::lower_bound(std::begin(segment), std::end(segment), start, [] ( auto& row, timestamp t) {return row.template index<TimeseriesIndex>() < t; });
-        size_t num_rows = std::distance(std::begin(segment), bound);
-        if(num_rows == 0)
-            return std::nullopt;
-
-        auto output = SegmentInMemory{segment.descriptor(), num_rows};
-        std::copy(std::begin(segment), bound, std::back_inserter(output));
-        update_string_columns(segment, output);
-        FrameSlice new_slice{
-            std::make_shared<StreamDescriptor>(output.descriptor()),
-            existing.slice_.col_range,
-            RowRange{0, num_rows},
-            existing.slice_.hash_bucket(),
-            existing.slice_.num_buckets()};
-        auto fut_key = store->write(key.type(), version_id, key.id(), existing_range.start_, index_range.start_, std::move(output));
-        return SliceAndKey{std::move(new_slice), std::get<AtomKey>(std::move(fut_key).get())};
-    }
-    else {
-        util::check(existing_range.end_ > index_range.end_, "Unexpected non-intersection of update indices: {} !> {}", existing_range.end_ , index_range.end_);
-
-        auto bound = std::upper_bound(std::begin(segment), std::end(segment), end, [] ( timestamp t, auto& row) {
-            return t < row.template index<TimeseriesIndex>();
-        });
-        size_t num_rows = std::distance(bound, std::end(segment));
-        if(num_rows == 0)
-            return std::nullopt;
-
-        auto output = SegmentInMemory{segment.descriptor(), num_rows};
-        std::copy(bound, std::end(segment), std::back_inserter(output));
-        update_string_columns(segment, output);
-        FrameSlice new_slice{
-            std::make_shared<StreamDescriptor>(output.descriptor()),
-            existing.slice_.col_range,
-            RowRange{0, num_rows},
-            existing.slice_.hash_bucket(),
-            existing.slice_.num_buckets()};
-        auto fut_key = store->write(key.type(), version_id, key.id(), index_range.end_, existing_range.end_, std::move(output));
-        return SliceAndKey{std::move(new_slice), std::get<AtomKey>(std::move(fut_key).get())};
-    }
+    auto& segment = kv.second;
+    const IndexRange affected_index_range = partial_rewrite_index_range(existing_range, index_range, affected_end);
+    const RowRange affected_row_range = partial_rewrite_row_range(segment, index_range, affected_end);
+    const size_t num_rows =
affected_row_range.end() - affected_row_range.start(); + SegmentInMemory output = segment.truncate(affected_row_range.start(), affected_row_range.end(), true); + const FrameSlice new_slice{ + std::make_shared(output.descriptor()), + existing.slice_.col_range, + RowRange{0, num_rows}, + existing.slice_.hash_bucket(), + existing.slice_.num_buckets()}; + auto fut_key = store->write( + key.type(), + version_id, + key.id(), + affected_index_range.start_, + affected_index_range.end_, + std::move(output) + ); + return SliceAndKey{std::move(new_slice), std::get(std::move(fut_key).get())}; } std::vector flatten_and_fix_rows(boost::span> groups, size_t& global_count) { diff --git a/cpp/arcticdb/pipeline/write_frame.hpp b/cpp/arcticdb/pipeline/write_frame.hpp index 52e4b69b6b..bfd5169108 100644 --- a/cpp/arcticdb/pipeline/write_frame.hpp +++ b/cpp/arcticdb/pipeline/write_frame.hpp @@ -62,11 +62,16 @@ folly::Future append_frame( bool ignore_sort_order ); +enum class AffectedSegmentEnd { + START, + END +}; + std::optional rewrite_partial_segment( const SliceAndKey& existing, IndexRange index_range, VersionId version_id, - bool before, + AffectedSegmentEnd affected_end, const std::shared_ptr& store); // TODO: Use std::span when C++20 is enabled diff --git a/cpp/arcticdb/processing/clause.cpp b/cpp/arcticdb/processing/clause.cpp index 80899e2191..d78574ad36 100644 --- a/cpp/arcticdb/processing/clause.cpp +++ b/cpp/arcticdb/processing/clause.cpp @@ -799,7 +799,7 @@ Composite RowRangeClause::process(Composite &&entity_ids) if (end_ > row_range->start() && end_ < row_range->end()) { end_row = end_ - (row_range->start()); } - auto truncated_segment = truncate_segment(*proc.segments_->at(idx), start_row, end_row); + auto truncated_segment = proc.segments_->at(idx)->truncate(start_row, end_row, false); auto num_rows = truncated_segment.is_null() ? 0 : truncated_segment.row_count(); proc.row_ranges_->at(idx) = std::make_shared(proc.row_ranges_->at(idx)->first, proc.row_ranges_->at(idx)->first + num_rows); auto num_cols = truncated_segment.is_null() ? 0 : truncated_segment.descriptor().field_count() - truncated_segment.descriptor().index().field_count(); diff --git a/cpp/arcticdb/processing/processing_unit.cpp b/cpp/arcticdb/processing/processing_unit.cpp index 53346dfcaf..f0f98c4eed 100644 --- a/cpp/arcticdb/processing/processing_unit.cpp +++ b/cpp/arcticdb/processing/processing_unit.cpp @@ -34,7 +34,7 @@ void ProcessingUnit::truncate(size_t start_row, size_t end_row) { "ProcessingUnit::truncate requires all of segments, row_ranges, and col_ranges to be present"); for (auto&& [idx, segment]: folly::enumerate(*segments_)) { - auto seg = truncate_segment(*segment, start_row, end_row); + auto seg = segment->truncate(start_row, end_row, false); auto num_rows = seg.is_null() ? 0 : seg.row_count(); row_ranges_->at(idx) = std::make_shared(row_ranges_->at(idx)->first, row_ranges_->at(idx)->first + num_rows); auto num_cols = seg.is_null() ? 0 : seg.descriptor().field_count() - seg.descriptor().index().field_count(); diff --git a/cpp/arcticdb/python/python_handlers.cpp b/cpp/arcticdb/python/python_handlers.cpp index 2f8d343c3a..675a8be1c2 100644 --- a/cpp/arcticdb/python/python_handlers.cpp +++ b/cpp/arcticdb/python/python_handlers.cpp @@ -5,7 +5,6 @@ * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. 
*/ #include -#include #include #include #include @@ -49,7 +48,7 @@ namespace arcticdb { static inline const PyObject** fill_with_none(const PyObject** dest, size_t count) { auto none = py::none(); - std::generate(dest, dest + count, [&none]() { return none.inc_ref().ptr(); }); + std::generate_n(dest, count, [&none]() { return none.inc_ref().ptr(); }); return dest + count; } @@ -85,9 +84,6 @@ namespace arcticdb { using RawType = typename decltype(tag)::DataTypeTag::raw_type; RawType* ptr_dest = reinterpret_cast(dest); if constexpr (is_sequence_type(tag.data_type())) { - // not_a_string() is set here as strings are offset in the string pool - // later in fix_and_reduce() the offsets are replaced by a real python strings - // TODO: It would be best if the type handler sets this to py::none std::fill_n(ptr_dest, m.num_rows_, arcticdb::not_a_string()); } else if constexpr (is_nullable_type(TypeDescriptor(tag))) { fill_with_none(reinterpret_cast(dest), m.num_rows_); @@ -140,35 +136,30 @@ namespace arcticdb { util::check(field->has_ndarray(), "Bool handler expected array"); ARCTICDB_DEBUG(log::version(), "Bool handler got encoded field: {}", field->DebugString()); auto ptr_dest = reinterpret_cast(dest); - - if(!field->ndarray().sparse_map_bytes()) { - ARCTICDB_DEBUG(log::version(), "Bool handler has no values"); - fill_with_none(ptr_dest, m.num_rows_); - return; - } - - std::optional bv; const auto& ndarray = field->ndarray(); const auto bytes = encoding_sizes::data_uncompressed_size(ndarray); - auto sparse = ChunkedBuffer::presized(bytes); - SliceDataSink sparse_sink{sparse.data(), bytes}; - data += decode_field(m.source_type_desc_, *field, data, sparse_sink, bv, encding_version); - const auto num_bools = bv->count(); - auto ptr_src = sparse.template ptr_cast(0, num_bools * sizeof(uint8_t)); - auto last_row = 0u; - const auto ptr_begin = ptr_dest; - - for(auto en = bv->first(); en < bv->end(); ++en) { - const auto current_pos = *en; - ptr_dest = fill_with_none(ptr_dest, current_pos - last_row); - last_row = current_pos; - auto py_bool = py::bool_(static_cast(*ptr_src++)); - *ptr_dest++ = py_bool.release().ptr(); - ++last_row; - util::check(ptr_begin + last_row == ptr_dest, "Boolean pointer out of alignment {} != {}", last_row, uintptr_t(ptr_begin + last_row)); + ChunkedBuffer decoded_data = ChunkedBuffer::presized(bytes); + SliceDataSink decoded_data_sink{decoded_data.data(), bytes}; + std::optional sparse_map; + data += decode_field(m.source_type_desc_, *field, data, decoded_data_sink, sparse_map, encding_version); + const auto num_bools = sparse_map.has_value() ? sparse_map->count() : m.num_rows_; + auto ptr_src = decoded_data.template ptr_cast(0, num_bools * sizeof(uint8_t)); + if (sparse_map.has_value()) { + ARCTICDB_TRACE(log::codec(), "Bool handler using a sparse map"); + unsigned last_row = 0u; + for (auto en = sparse_map->first(); en < sparse_map->end(); ++en, last_row++) { + const auto current_pos = *en; + ptr_dest = fill_with_none(ptr_dest, current_pos - last_row); + last_row = current_pos; + *ptr_dest++ = py::bool_(static_cast(*ptr_src++)).release().ptr(); + } + fill_with_none(ptr_dest, m.num_rows_ - last_row); + } else { + ARCTICDB_TRACE(log::codec(), "Bool handler didn't find a sparse map. 
Assuming dense array.");
+                std::transform(ptr_src, ptr_src + num_bools, ptr_dest, [](uint8_t value) {
+                    return py::bool_(static_cast<bool>(value)).release().ptr();
+                });
             }
-            fill_with_none(ptr_dest, m.num_rows_ - last_row);
-            util::check(ptr_begin + last_row == ptr_dest, "Boolean pointer out of alignment at end: {} != {}", last_row, uintptr_t(ptr_begin + last_row));
         }, encoded_field_info);
     }
diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp
index fb304c9a30..328d8429c3 100644
--- a/cpp/arcticdb/version/version_core.cpp
+++ b/cpp/arcticdb/version/version_core.cpp
@@ -210,16 +210,28 @@ inline std::pair<std::vector<SliceAndKey>, std::vector<SliceAndKey>> intersectin
     for (const auto& affected_slice_and_key : affected_keys) {
         const auto& affected_range = affected_slice_and_key.key().index_range();
-        if (intersects(affected_range, front_range) && !overlaps(affected_range, front_range)
-            && is_before(affected_range, front_range)) {
-            auto front_overlap_key = rewrite_partial_segment(affected_slice_and_key, front_range, version_id, false, store);
+        if (intersects(affected_range, front_range) && !overlaps(affected_range, front_range) &&
+            is_before(affected_range, front_range)) {
+            auto front_overlap_key = rewrite_partial_segment(
+                affected_slice_and_key,
+                front_range,
+                version_id,
+                AffectedSegmentEnd::START,
+                store
+            );
             if (front_overlap_key)
                 intersect_before.push_back(*front_overlap_key);
         }
         if (intersects(affected_range, back_range) && !overlaps(affected_range, back_range) &&
             is_after(affected_range, back_range)) {
-            auto back_overlap_key = rewrite_partial_segment(affected_slice_and_key, back_range, version_id, true, store);
+            auto back_overlap_key = rewrite_partial_segment(
+                affected_slice_and_key,
+                back_range,
+                version_id,
+                AffectedSegmentEnd::END,
+                store
+            );
             if (back_overlap_key)
                 intersect_after.push_back(*back_overlap_key);
         }
diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py
index 9cccbea962..5d881d39fe 100644
--- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py
+++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py
@@ -227,4 +227,21 @@ def test_date(self, lmdb_version_store_static_and_dynamic):
         assert_frame_equal(
             lmdb_version_store_static_and_dynamic.read("sym", row_range=[3,5]).data,
             pd.DataFrame({"col1": np.array([np.datetime64('NaT'), np.datetime64('NaT')], dtype="datetime64[ns]")})
-        )
\ No newline at end of file
+        )
+
+
+class TestCanUpdateWithEmpty:
+    def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype):
+        index = list(pd.date_range(start="1/1/2024", end="1/4/2024"))
+        lmdb_version_store_static_and_dynamic.write(
+            "sym",
+            pd.DataFrame({"col": [True, True, True, True]}, dtype=boolean_dtype, index=index)
+        )
+        lmdb_version_store_static_and_dynamic.update(
+            "sym",
+            pd.DataFrame({"col": [None, None]}, dtype=boolean_dtype, index=list(pd.date_range(start="1/2/2024", end="1/3/2024")))
+        )
+        assert_frame_equal(
+            lmdb_version_store_static_and_dynamic.read("sym").data,
+            pd.DataFrame({"col": [True, None, None, True]}, index=index, dtype=boolean_dtype)
+        )

From 0efb037d305df42ed8202135b86e15fee9690e72 Mon Sep 17 00:00:00 2001
From: Vasil Pashov
Date: Tue, 30 Jan 2024 16:21:34 +0200
Subject: [PATCH 08/30] Complete testing for updating with empty type column

---
 cpp/arcticdb/CMakeLists.txt                  |   1 -
 cpp/arcticdb/column_store/column.cpp         |   2 -
 cpp/arcticdb/column_store/column.hpp         |   4 +-
 .../column_store/memory_segment_impl.cpp     |  19 ++-
 cpp/arcticdb/pipeline/write_frame.cpp        |  21 +--
 cpp/arcticdb/pipeline/write_frame.hpp        |   4 +-
 cpp/arcticdb/version/version_core.cpp        |  11 +-
 .../version_store/test_empty_column_type.py  | 137 ++++++++++++++++--
 8 files changed, 154 insertions(+), 45 deletions(-)

diff --git a/cpp/arcticdb/CMakeLists.txt b/cpp/arcticdb/CMakeLists.txt
index 71e347ae62..ea1d9dfdc8 100644
--- a/cpp/arcticdb/CMakeLists.txt
+++ b/cpp/arcticdb/CMakeLists.txt
@@ -230,7 +230,6 @@ set(arcticdb_srcs
         pipeline/read_pipeline.hpp
         pipeline/slicing.hpp
         pipeline/string_pool_utils.hpp
-        pipeline/string_pool_utils.hpp
        pipeline/value.hpp
        pipeline/value_set.hpp
        pipeline/write_frame.hpp
diff --git a/cpp/arcticdb/column_store/column.cpp b/cpp/arcticdb/column_store/column.cpp
index 4d62a6b4d7..3be9d17f0c 100644
--- a/cpp/arcticdb/column_store/column.cpp
+++ b/cpp/arcticdb/column_store/column.cpp
@@ -512,8 +512,6 @@ std::vector<std::shared_ptr<Column>> Column::split(const std::shared_ptr
     return output;
 }

-// Produces a new column
-// Inclusive of start_row, exclusive of end_row
 std::shared_ptr<Column> Column::truncate(const std::shared_ptr<Column>& column, size_t start_row, size_t end_row) {
     auto type_size = get_type_size(column->type().data_type());
     // Bytes from the underlying chunked buffer to include. Inclusive of start_byte, exclusive of end_byte
diff --git a/cpp/arcticdb/column_store/column.hpp b/cpp/arcticdb/column_store/column.hpp
index b8f69d1599..9ca1d62712 100644
--- a/cpp/arcticdb/column_store/column.hpp
+++ b/cpp/arcticdb/column_store/column.hpp
@@ -647,7 +647,9 @@ class Column {
     }

     static std::vector<std::shared_ptr<Column>> split(const std::shared_ptr<Column>& column, size_t num_rows);
-
+    /// @brief Produces a new column containing only the data in range [start_row, end_row)
+    /// @param[in] start_row Inclusive start of the row range
+    /// @param[in] end_row Exclusive end of the row range
     static std::shared_ptr<Column> truncate(const std::shared_ptr<Column>& column, size_t start_row, size_t end_row);

 private:
diff --git a/cpp/arcticdb/column_store/memory_segment_impl.cpp b/cpp/arcticdb/column_store/memory_segment_impl.cpp
index 4be680acdc..f483b7b4a9 100644
--- a/cpp/arcticdb/column_store/memory_segment_impl.cpp
+++ b/cpp/arcticdb/column_store/memory_segment_impl.cpp
@@ -446,16 +446,15 @@ std::shared_ptr<SegmentInMemoryImpl> SegmentInMemoryImpl::truncate(
         const Field& field = descriptor_->field(idx);
         if (is_sequence_type(column_type) && reconstruct_string_pool) {
             output->add_column(field, num_values, true);
+            const ChunkedBuffer& source = column->data().buffer();
             ChunkedBuffer& target = output->column(idx).data().buffer();
-            for (size_t row = 0u; row < num_values; ++row) {
-                util::variant_match(
-                    get_string_from_buffer(row, column->data().buffer(), output->string_pool()),
-                    [&](std::string_view sv) {
-                        OffsetString off_str = output->string_pool().get(sv);
-                        set_offset_string_at(row, target, off_str.offset());
-                    },
-                    [&](entity::position_t offset) { set_offset_string_at(row, target, offset); }
+            for (size_t row = 0; row < num_values; ++row) {
+                const StringPool::offset_t offset = util::variant_match(
+                    get_string_from_buffer(start_row + row, source, *string_pool_),
+                    [&](std::string_view sv) { return output->string_pool().get(sv).offset(); },
+                    [](StringPool::offset_t offset) { return offset; }
                 );
+                set_offset_string_at(row, target, offset);
             }
         } else {
             auto truncated_column = Column::truncate(column, start_row, end_row);
@@ -466,8 +465,8 @@ std::shared_ptr<SegmentInMemoryImpl> SegmentInMemoryImpl::truncate(
     return output;
 }

-// Combine 2 segments that hold different columns associated with the same rows
-// If unique_column_names is true, any
columns from other with names matching those in this are ignored +/// @brief Combine 2 segments that hold different columns associated with the same rows +/// @param[in] unique_column_names If true, any columns from other with names matching those in this are ignored void SegmentInMemoryImpl::concatenate(SegmentInMemoryImpl&& other, bool unique_column_names) { internal::check( row_count() == other.row_count(), diff --git a/cpp/arcticdb/pipeline/write_frame.cpp b/cpp/arcticdb/pipeline/write_frame.cpp index 1afc384227..8f895ee07a 100644 --- a/cpp/arcticdb/pipeline/write_frame.cpp +++ b/cpp/arcticdb/pipeline/write_frame.cpp @@ -222,9 +222,9 @@ folly::Future append_frame( static RowRange partial_rewrite_row_range( const SegmentInMemory& segment, const IndexRange& range, - AffectedSegmentEnd affected_end + AffectedSegmentPart affected_end ) { - if (affected_end == AffectedSegmentEnd::START) { + if (affected_end == AffectedSegmentPart::START) { const timestamp start = std::get(range.start_); auto bound = std::lower_bound(std::begin(segment), std::end(segment), start, [](const auto& row, timestamp t) { return row.template index() < t; @@ -245,9 +245,9 @@ static RowRange partial_rewrite_row_range( static IndexRange partial_rewrite_index_range( const IndexRange& segment_range, const IndexRange& update_range, - AffectedSegmentEnd affected_end + AffectedSegmentPart affected_part ) { - if (affected_end == AffectedSegmentEnd::START) { + if (affected_part == AffectedSegmentPart::START) { util::check( segment_range.start_ < update_range.start_, "Unexpected index range in after: {} !< {}", @@ -270,18 +270,18 @@ std::optional rewrite_partial_segment( const SliceAndKey& existing, IndexRange index_range, VersionId version_id, - AffectedSegmentEnd affected_end, + AffectedSegmentPart affected_part, const std::shared_ptr& store ) { const auto& key = existing.key(); - const auto& existing_range = key.index_range(); + const IndexRange& existing_range = key.index_range(); auto kv = store->read(key).get(); - auto& segment = kv.second; - const IndexRange affected_index_range = partial_rewrite_index_range(existing_range, index_range, affected_end); - const RowRange affected_row_range = partial_rewrite_row_range(segment, index_range, affected_end); + const SegmentInMemory& segment = kv.second; + const IndexRange affected_index_range = partial_rewrite_index_range(existing_range, index_range, affected_part); + const RowRange affected_row_range = partial_rewrite_row_range(segment, index_range, affected_part); const size_t num_rows = affected_row_range.end() - affected_row_range.start(); SegmentInMemory output = segment.truncate(affected_row_range.start(), affected_row_range.end(), true); - const FrameSlice new_slice{ + FrameSlice new_slice{ std::make_shared(output.descriptor()), existing.slice_.col_range, RowRange{0, num_rows}, @@ -300,6 +300,7 @@ std::optional rewrite_partial_segment( std::vector flatten_and_fix_rows(boost::span> groups, size_t& global_count) { std::vector output; + output.reserve(groups.size()); global_count = 0; for (const std::vector& group : groups) { if (group.empty()) diff --git a/cpp/arcticdb/pipeline/write_frame.hpp b/cpp/arcticdb/pipeline/write_frame.hpp index bfd5169108..28559823eb 100644 --- a/cpp/arcticdb/pipeline/write_frame.hpp +++ b/cpp/arcticdb/pipeline/write_frame.hpp @@ -62,7 +62,7 @@ folly::Future append_frame( bool ignore_sort_order ); -enum class AffectedSegmentEnd { +enum class AffectedSegmentPart { START, END }; @@ -71,7 +71,7 @@ std::optional rewrite_partial_segment( const 
SliceAndKey& existing, IndexRange index_range, VersionId version_id, - AffectedSegmentEnd affected_end, + AffectedSegmentPart affected_part, const std::shared_ptr& store); // TODO: Use std::span when C++20 is enabled diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index 328d8429c3..74f258dfe5 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -204,7 +204,8 @@ inline std::pair, std::vector> intersectin const IndexRange& front_range, const IndexRange& back_range, VersionId version_id, - const std::shared_ptr& store) { + const std::shared_ptr& store +) { std::vector intersect_before; std::vector intersect_after; @@ -216,20 +217,20 @@ inline std::pair, std::vector> intersectin affected_slice_and_key, front_range, version_id, - AffectedSegmentEnd::START, + AffectedSegmentPart::START, store ); if (front_overlap_key) intersect_before.push_back(*front_overlap_key); } - if (intersects(affected_range, back_range) && !overlaps(affected_range, back_range) - && is_after(affected_range, back_range)) { + if (intersects(affected_range, back_range) && !overlaps(affected_range, back_range) && + is_after(affected_range, back_range)) { auto back_overlap_key = rewrite_partial_segment( affected_slice_and_key, back_range, version_id, - AffectedSegmentEnd::END, + AffectedSegmentPart::END, store ); if (back_overlap_key) diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index 5d881d39fe..290ef1cc2d 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -27,8 +27,12 @@ def boolean_dtype(request): yield request.param -class TestCanAppendToEmptyColumn: +@pytest.fixture(params=["datetime64[ns]"]) +def date_dtype(request): + yield request.param + +class TestCanAppendToEmptyColumn: @pytest.fixture(autouse=True) def create_empty_column(self, lmdb_version_store_static_and_dynamic): lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col1": [None, None]})) @@ -104,14 +108,38 @@ def test_string(self, lmdb_version_store_static_and_dynamic): df_non_empty ) - def test_date(self, lmdb_version_store_static_and_dynamic): - df_non_empty = pd.DataFrame({"col1": np.array([np.datetime64('2005-02'), np.datetime64('2005-03'), np.datetime64('2005-03')])}, dtype="datetime64[ns]") + def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): + df_non_empty = pd.DataFrame( + { + "col1": np.array( + [ + np.datetime64('2005-02'), + np.datetime64('2005-03'), + np.datetime64('2005-03') + ] + ) + }, + dtype=date_dtype + ) lmdb_version_store_static_and_dynamic.append("sym", df_non_empty) - expected_result = pd.DataFrame({"col1": np.array([np.datetime64('NaT'), np.datetime64('NaT'), np.datetime64('2005-02'), np.datetime64('2005-03'), np.datetime64('2005-03')], dtype="datetime64[ns]")}) + expected_result = pd.DataFrame( + { + "col1": np.array( + [ + np.datetime64('NaT'), + np.datetime64('NaT'), + np.datetime64('2005-02'), + np.datetime64('2005-03'), + np.datetime64('2005-03') + ], + dtype=date_dtype + ) + } + ) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, expected_result) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[0,2]).data, - pd.DataFrame({"col1": np.array([np.datetime64('NaT'), np.datetime64('NaT')], dtype="datetime64[ns]")}) + pd.DataFrame({"col1": 
np.array([np.datetime64('NaT'), np.datetime64('NaT')], dtype=date_dtype)}) ) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[2,5]).data, @@ -120,7 +148,6 @@ def test_date(self, lmdb_version_store_static_and_dynamic): class TestCanAppendEmptyToColumn: - def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype): df_initial = pd.DataFrame({"col1": np.array([1,2,3], dtype=int_dtype)}) lmdb_version_store_static_and_dynamic.write("sym", df_initial) @@ -191,7 +218,7 @@ def test_string(self, lmdb_version_store_static_and_dynamic): df_with_none ) - def test_date(self, lmdb_version_store_static_and_dynamic): + def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): df_initial = pd.DataFrame( { "col1": np.array( @@ -202,7 +229,7 @@ def test_date(self, lmdb_version_store_static_and_dynamic): ] ) }, - dtype="datetime64[ns]" + dtype=date_dtype ) lmdb_version_store_static_and_dynamic.write("sym", df_initial) lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col1": np.array([None, None])})) @@ -216,7 +243,7 @@ def test_date(self, lmdb_version_store_static_and_dynamic): np.datetime64('NaT'), np.datetime64('NaT') ], - dtype="datetime64[ns]") + dtype=date_dtype) } ) assert_frame_equal(lmdb_version_store_static_and_dynamic.read("sym").data, expected_result) @@ -226,22 +253,104 @@ def test_date(self, lmdb_version_store_static_and_dynamic): ) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym", row_range=[3,5]).data, - pd.DataFrame({"col1": np.array([np.datetime64('NaT'), np.datetime64('NaT')], dtype="datetime64[ns]")}) + pd.DataFrame({"col1": np.array([np.datetime64('NaT'), np.datetime64('NaT')], dtype=date_dtype)}) ) class TestCanUpdateWithEmpty: + def index(self): + return list(pd.date_range(start="1/1/2024", end="1/4/2024")) + + def update_index(self): + return list(pd.date_range(start="1/2/2024", end="1/3/2024")) + + def test_int(self, lmdb_version_store_static_and_dynamic, int_dtype): + lmdb_version_store_static_and_dynamic.write( + "sym", + pd.DataFrame({"col": [1, 2, 3, 4]}, dtype=int_dtype, index=self.index()) + ) + lmdb_version_store_static_and_dynamic.update( + "sym", + pd.DataFrame({"col": [None, None]}, index=self.update_index()) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame({"col": [1, 0, 0, 4]}, index=self.index(), dtype=int_dtype) + ) + + def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype): + lmdb_version_store_static_and_dynamic.write( + "sym", + pd.DataFrame({"col": [1, 2, 3, 4]}, dtype=float_dtype, index=self.index()) + ) + lmdb_version_store_static_and_dynamic.update( + "sym", + pd.DataFrame({"col": [None, None]}, index=self.update_index()) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame({"col": [1, float("NaN"), float("NaN"), 4]}, index=self.index(), dtype=float_dtype) + ) + def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype): - index = list(pd.date_range(start="1/1/2024", end="1/4/2024")) lmdb_version_store_static_and_dynamic.write( "sym", - pd.DataFrame({"col": [True, True, True, True]}, dtype=boolean_dtype, index=index) + pd.DataFrame({"col": [True, True, True, True]}, dtype=boolean_dtype, index=self.index()) ) lmdb_version_store_static_and_dynamic.update( "sym", - pd.DataFrame({"col": [None, None]}, dtype=boolean_dtype, index=list(pd.date_range(start="1/2/2024", end="1/3/2024"))) + pd.DataFrame({"col": [None, None]}, dtype=boolean_dtype, 
index=self.update_index()) ) assert_frame_equal( lmdb_version_store_static_and_dynamic.read("sym").data, - pd.DataFrame({"col": [True, None, None, True]}, index=index, dtype=boolean_dtype) + pd.DataFrame({"col": [True, None, None, True]}, index=self.index(), dtype=boolean_dtype) + ) + + def test_string(self, lmdb_version_store_static_and_dynamic): + lmdb_version_store_static_and_dynamic.write( + "sym", + pd.DataFrame({"col": ["a", "longstr"*20, "b", "longstr"*20]}, index=self.index()) + ) + lmdb_version_store_static_and_dynamic.update( + "sym", + pd.DataFrame({"col": [None, None]}, index=self.update_index()) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame({"col": ["a", None, None, "longstr"*20]}, index=self.index()) + ) + + def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): + lmdb_version_store_static_and_dynamic.write( + "sym", + pd.DataFrame( + { + "col": [ + np.datetime64('2005-02'), + np.datetime64('2005-03'), + np.datetime64('2005-04'), + np.datetime64('2005-05') + ] + }, + dtype=date_dtype, + index=self.index() + ) + ) + lmdb_version_store_static_and_dynamic.update( + "sym", + pd.DataFrame({"col": [None, None]}, index=self.update_index()) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame( + { + "col": [ + np.datetime64('2005-02'), + np.datetime64('NaT'), + np.datetime64('NaT'), + np.datetime64('2005-05') + ] + }, + index=self.index() + ) ) From f25b788dffd73cfde9250efb8a5711ecd94c12db Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Tue, 30 Jan 2024 17:16:34 +0200 Subject: [PATCH 09/30] Additional test --- .../unit/arcticdb/version_store/test_empty_column_type.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index 290ef1cc2d..4618449edc 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -354,3 +354,11 @@ def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): index=self.index() ) ) + +class TestEmptyTypesIsOverriden: + def test_cannot_append_different_type_after_first_not_none(self, lmdb_version_store_static_and_dynamic): + lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col": [None, None]})) + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": [1, 2, 3]})) + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": [None, None]})) + with pytest.raises(Exception): + lmdb_version_store_static_and_dynamic.append("sym", pd.DataFrame({"col": ["some", "string"]})) \ No newline at end of file From 17b642ca52603d75c79f6d713fe42508d32c959c Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Wed, 31 Jan 2024 14:33:04 +0200 Subject: [PATCH 10/30] FWD declare buffer holder --- cpp/arcticdb/entity/frame_and_descriptor.hpp | 1 - cpp/arcticdb/pipeline/read_frame.hpp | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/arcticdb/entity/frame_and_descriptor.hpp b/cpp/arcticdb/entity/frame_and_descriptor.hpp index 15f520fcb2..4a6a19a3f3 100644 --- a/cpp/arcticdb/entity/frame_and_descriptor.hpp +++ b/cpp/arcticdb/entity/frame_and_descriptor.hpp @@ -8,7 +8,6 @@ #pragma once #include -#include #include namespace arcticdb { diff --git a/cpp/arcticdb/pipeline/read_frame.hpp b/cpp/arcticdb/pipeline/read_frame.hpp index 0249c91c4d..d0c28348b8 100644 
--- a/cpp/arcticdb/pipeline/read_frame.hpp +++ b/cpp/arcticdb/pipeline/read_frame.hpp @@ -8,16 +8,18 @@ #pragma once #include -#include #include #include #include -#include #include #include +namespace arcticdb { + struct BufferHolder; +} + namespace arcticdb::pipelines { SegmentInMemory allocate_frame(const std::shared_ptr& context); From 0334f45e1be4cecf3026fb0b0b32b2f9532d86e5 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Thu, 1 Feb 2024 12:40:31 +0200 Subject: [PATCH 11/30] Add a single function to handle default initialization. --- cpp/arcticdb/codec/codec-inl.hpp | 4 --- cpp/arcticdb/pipeline/read_frame.cpp | 5 ++-- cpp/arcticdb/python/python_handlers.cpp | 33 +++++++++++-------------- cpp/arcticdb/python/python_handlers.hpp | 3 +++ cpp/arcticdb/util/sparse_utils.hpp | 23 +++++++++++------ cpp/arcticdb/util/type_handler.cpp | 6 ++--- cpp/arcticdb/util/type_handler.hpp | 17 +++++++------ 7 files changed, 49 insertions(+), 42 deletions(-) diff --git a/cpp/arcticdb/codec/codec-inl.hpp b/cpp/arcticdb/codec/codec-inl.hpp index a799def672..1c628cb928 100644 --- a/cpp/arcticdb/codec/codec-inl.hpp +++ b/cpp/arcticdb/codec/codec-inl.hpp @@ -18,10 +18,6 @@ #include -#include - -#include - #include namespace arcticdb { diff --git a/cpp/arcticdb/pipeline/read_frame.cpp b/cpp/arcticdb/pipeline/read_frame.cpp index d62a037d17..69bc82820b 100644 --- a/cpp/arcticdb/pipeline/read_frame.cpp +++ b/cpp/arcticdb/pipeline/read_frame.cpp @@ -476,19 +476,20 @@ void decode_into_frame_dynamic( m.source_type_desc_.visit_tag([&](auto src_desc_tag) { using SourceType = typename decltype(src_desc_tag)::DataTypeTag::raw_type; if constexpr (std::is_arithmetic_v && std::is_arithmetic_v) { - const auto src_bytes = sizeof_datatype(m.source_type_desc_) * m.num_rows_; auto dest_ptr = reinterpret_cast(buffer.data() + m.offset_bytes_); if constexpr (is_empty_type(src_desc_tag.data_type())) { + const size_t dest_bytes = m.num_rows_ * sizeof_datatype(m.dest_type_desc_); decode_or_expand( data, reinterpret_cast(dest_ptr), encoded_field, - src_bytes, + dest_bytes, buffers, encdoing_version, m ); } else { + const auto src_bytes = sizeof_datatype(m.source_type_desc_) * m.num_rows_; Buffer tmp_buf{src_bytes}; decode_or_expand( data, diff --git a/cpp/arcticdb/python/python_handlers.cpp b/cpp/arcticdb/python/python_handlers.cpp index 675a8be1c2..fb2450b58d 100644 --- a/cpp/arcticdb/python/python_handlers.cpp +++ b/cpp/arcticdb/python/python_handlers.cpp @@ -8,6 +8,7 @@ #include #include #include +#include namespace arcticdb { @@ -76,26 +77,8 @@ namespace arcticdb { ); static_assert(get_type_size(DataType::EMPTYVAL) == sizeof(PyObject*)); - // TODO: arcticdb::util::default_initialize does mostly the same thing - // the main difference is that it does not write py::none directly - // * Should use that? - // * It doesn't handle nullable bools and arrays? How should they be handled? 
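+        // Hedged sketch of what the dispatch below amounts to: for a FLOAT64 destination
+        // it is roughly
+        //     std::fill_n(reinterpret_cast<double*>(dest), dest_bytes / sizeof(double),
+        //                 std::numeric_limits<double>::quiet_NaN());
+        // while object-typed destinations are filled with py::none via the registered
+        // type handler's default_initialize.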
m.dest_type_desc_.visit_tag([&](auto tag) { - using RawType = typename decltype(tag)::DataTypeTag::raw_type; - RawType* ptr_dest = reinterpret_cast(dest); - if constexpr (is_sequence_type(tag.data_type())) { - std::fill_n(ptr_dest, m.num_rows_, arcticdb::not_a_string()); - } else if constexpr (is_nullable_type(TypeDescriptor(tag))) { - fill_with_none(reinterpret_cast(dest), m.num_rows_); - } else if constexpr (is_floating_point_type(tag.data_type())) { - std::fill_n(ptr_dest, m.num_rows_, std::numeric_limits::quiet_NaN()); - } else if constexpr (is_integer_type(tag.data_type()) || is_bool_type(tag.data_type())) { - std::memset(dest, 0, sizeof(RawType) * m.num_rows_); - } else if constexpr (is_time_type(tag.data_type())) { - std::fill_n(ptr_dest, m.num_rows_, arcticdb::util::NaT); - } else { - static_assert(sizeof(tag) == 0, "Unhandled data_type"); - } + util::default_initialize(dest, dest_bytes); }); util::variant_match(variant_field, [&input](const auto& field) { @@ -121,6 +104,10 @@ namespace arcticdb { return sizeof(PyObject*); } + void EmptyHandler::default_initialize(void* dest, size_t byte_size) const { + fill_with_none(reinterpret_cast(dest), byte_size / type_size()); + } + void BoolHandler::handle_type( const uint8_t*& data, uint8_t* dest, @@ -167,6 +154,10 @@ namespace arcticdb { return sizeof(PyObject*); } + void BoolHandler::default_initialize(void* dest, size_t byte_size) const { + fill_with_none(reinterpret_cast(dest), byte_size / type_size()); + } + std::mutex ArrayHandler::initialize_array_mutex; void ArrayHandler::handle_type( @@ -241,4 +232,8 @@ namespace arcticdb { int ArrayHandler::type_size() const { return sizeof(PyObject*); } + + void ArrayHandler::default_initialize(void* dest, size_t byte_size) const { + fill_with_none(reinterpret_cast(dest), byte_size / type_size()); + } } diff --git a/cpp/arcticdb/python/python_handlers.hpp b/cpp/arcticdb/python/python_handlers.hpp index ac073ac69d..186c87cbd9 100644 --- a/cpp/arcticdb/python/python_handlers.hpp +++ b/cpp/arcticdb/python/python_handlers.hpp @@ -25,6 +25,7 @@ namespace arcticdb { ); int type_size() const; + void default_initialize(void* dest, size_t byte_size) const; }; struct BoolHandler { @@ -39,6 +40,7 @@ namespace arcticdb { const ColumnMapping& columnMapping ); int type_size() const; + void default_initialize(void* dest, size_t byte_size) const; }; struct DecimalHandler { @@ -66,6 +68,7 @@ namespace arcticdb { const ColumnMapping& columnMapping ); int type_size() const; + void default_initialize(void* dest, size_t byte_size) const; static std::mutex initialize_array_mutex; }; } //namespace arcticdb diff --git a/cpp/arcticdb/util/sparse_utils.hpp b/cpp/arcticdb/util/sparse_utils.hpp index 5b17bcfe5a..a7cf86e391 100644 --- a/cpp/arcticdb/util/sparse_utils.hpp +++ b/cpp/arcticdb/util/sparse_utils.hpp @@ -9,6 +9,7 @@ #include #include +#include #include @@ -16,7 +17,6 @@ #include -#include #include namespace arcticdb::util { @@ -68,11 +68,8 @@ inline void expand_dense_buffer_using_bitmap(const util::BitMagic &bv, const uin } } -// TODO: EmptyHandler::handle_type has similar procedure. -// * Should this be used instead? -// * What about nullable booleans and arrays? 
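+// Default-initializes a raw buffer according to TagType: sequence types get not_a_string(),
+// floating point gets quiet_NaN, time types get NaT, integers and plain bools get zeros,
+// and anything else is delegated to the registered type handler. A hedged usage sketch
+// (illustrative tag alias only):
+//     using F64Tag = ScalarTagType<DataTypeTag<DataType::FLOAT64>>;
+//     util::default_initialize<F64Tag>(ptr, num_rows * sizeof(double));  // NaN-fill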
template -void default_initialize(uint8_t* data, size_t bytes) { +inline void default_initialize(uint8_t* data, size_t bytes) { using RawType = typename TagType::DataTypeTag::raw_type; const auto num_rows ARCTICDB_UNUSED = bytes / sizeof(RawType); constexpr auto data_type = TagType::DataTypeTag::data_type; @@ -80,11 +77,23 @@ void default_initialize(uint8_t* data, size_t bytes) { if constexpr (is_sequence_type(data_type)) { std::fill_n(type_ptr, num_rows, not_a_string()); } else if constexpr (is_floating_point_type(data_type)) { - std::fill_n(type_ptr, num_rows, std::numeric_limits::quiet_NaN()); + std::fill_n(type_ptr, num_rows, std::numeric_limits::quiet_NaN()); } else if constexpr (is_time_type(data_type)) { std::fill_n(type_ptr, num_rows, NaT); + } else if constexpr (is_integer_type(data_type) || is_bool_type(data_type)) { + std::memset(data, 0, bytes); } else { - std::fill_n(data, bytes, 0); + constexpr TypeDescriptor type_descriptor = TagType::type_descriptor(); + const std::shared_ptr& handler = + arcticdb::TypeHandlerRegistry::instance()->get_handler(type_descriptor); + if (handler) { + handler->default_initialize(data, bytes); + } else { + internal::raise( + "Default initialization for {} is not implemented.", + type_descriptor + ); + } } } diff --git a/cpp/arcticdb/util/type_handler.cpp b/cpp/arcticdb/util/type_handler.cpp index 1bf4734e3f..8a3a1d2e41 100644 --- a/cpp/arcticdb/util/type_handler.cpp +++ b/cpp/arcticdb/util/type_handler.cpp @@ -21,16 +21,16 @@ void TypeHandlerRegistry::init() { std::shared_ptr TypeHandlerRegistry::instance_; std::once_flag TypeHandlerRegistry::init_flag_; -std::shared_ptr TypeHandlerRegistry::get_handler(const util::TypeDescriptor& type_descriptor) const { +std::shared_ptr TypeHandlerRegistry::get_handler(const entity::TypeDescriptor& type_descriptor) const { auto it = handlers_.find(type_descriptor); return it == std::end(handlers_) ? std::shared_ptr{} : it->second; } -void TypeHandlerRegistry::register_handler(const util::TypeDescriptor& type_descriptor, TypeHandler&& handler) { +void TypeHandlerRegistry::register_handler(const entity::TypeDescriptor& type_descriptor, TypeHandler&& handler) { handlers_.try_emplace(type_descriptor, std::make_shared(std::move(handler))); } -size_t TypeHandlerRegistry::Hasher::operator()(const util::TypeDescriptor descriptor) const { +size_t TypeHandlerRegistry::Hasher::operator()(const entity::TypeDescriptor descriptor) const { static_assert(sizeof(descriptor) == sizeof(uint16_t), "Cannot compute util::TypeDescriptor's hash. 
The size is wrong.");
    static_assert(sizeof(decltype(descriptor.data_type())) == 1);
    static_assert(sizeof(decltype(descriptor.dimension())) == 1);
diff --git a/cpp/arcticdb/util/type_handler.hpp b/cpp/arcticdb/util/type_handler.hpp
index 8a3d167bcd..87fcb6296f 100644
--- a/cpp/arcticdb/util/type_handler.hpp
+++ b/cpp/arcticdb/util/type_handler.hpp
@@ -8,8 +8,6 @@
 #pragma once

 #include
-#include
-#include
 #include
 #include

@@ -20,6 +18,7 @@
 namespace arcticdb {

+struct BufferHolder;

 struct ITypeHandler {
     template
@@ -53,10 +52,14 @@ struct ITypeHandler {
     int type_size() const {
         return folly::poly_call<1>(*this);
     }
+
+    void default_initialize(void* dest, size_t byte_size) const {
+        folly::poly_call<2>(*this, dest, byte_size);
+    }
 };

     template<class T>
-    using Members = folly::PolyMembers<&T::handle_type, &T::type_size>;
+    using Members = folly::PolyMembers<&T::handle_type, &T::type_size, &T::default_initialize>;
 };

 using TypeHandler = folly::Poly<ITypeHandler>;

@@ -71,13 +74,13 @@ class TypeHandlerRegistry {
     static void init();
     static std::shared_ptr<TypeHandlerRegistry> instance();

-    std::shared_ptr<TypeHandler> get_handler(const util::TypeDescriptor& type_descriptor) const;
-    void register_handler(const util::TypeDescriptor& type_descriptor, TypeHandler&& handler);
+    std::shared_ptr<TypeHandler> get_handler(const entity::TypeDescriptor& type_descriptor) const;
+    void register_handler(const entity::TypeDescriptor& type_descriptor, TypeHandler&& handler);
 private:
     struct Hasher {
-        size_t operator()(const util::TypeDescriptor val) const;
+        size_t operator()(const entity::TypeDescriptor val) const;
     };
-    std::unordered_map<util::TypeDescriptor, std::shared_ptr<TypeHandler>, Hasher> handlers_;
+    std::unordered_map<entity::TypeDescriptor, std::shared_ptr<TypeHandler>, Hasher> handlers_;
 };
 }

From e3d71cbe19579c2ed0d79dcf3b3e67c0bafe6049 Mon Sep 17 00:00:00 2001
From: Vasil Pashov
Date: Thu, 1 Feb 2024 14:45:30 +0200
Subject: [PATCH 12/30] Change pybool type name to a language-agnostic name: bool_object

---
 cpp/arcticdb/column_store/column_utils.hpp     |  5 ++---
 cpp/arcticdb/entity/type_utils.cpp             |  2 +-
 cpp/arcticdb/entity/types-inl.hpp              |  4 ++--
 cpp/arcticdb/entity/types.cpp                  |  2 +-
 cpp/arcticdb/entity/types.hpp                  | 12 ++++++------
 cpp/arcticdb/pipeline/frame_utils.hpp          |  2 +-
 cpp/arcticdb/processing/aggregation.cpp        |  4 ++--
 cpp/arcticdb/python/python_handlers.cpp        |  4 ----
 cpp/arcticdb/python/python_module.cpp          |  2 +-
 cpp/arcticdb/python/python_to_tensor_frame.cpp |  2 +-
 cpp/proto/arcticc/pb2/descriptors.proto        |  2 +-
 11 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/cpp/arcticdb/column_store/column_utils.hpp b/cpp/arcticdb/column_store/column_utils.hpp
index 351617a62b..90a35a1048 100644
--- a/cpp/arcticdb/column_store/column_utils.hpp
+++ b/cpp/arcticdb/column_store/column_utils.hpp
@@ -7,7 +7,6 @@
 #pragma once

-#include
 #include
 #include

@@ -44,7 +43,7 @@ inline py::array array_at(const SegmentInMemory& frame, std::size_t col_pos, py:
             } else {
                 dtype = fmt::format("{}{:d}", get_dtype_specifier(data_type), esize);
             }
-        } else if constexpr (is_empty_type(data_type) || is_py_bool_type(data_type) || is_numpy_array(TypeDescriptor(tag))) {
+        } else if constexpr (is_empty_type(data_type) || is_bool_object_type(data_type) || is_numpy_array(TypeDescriptor(tag))) {
             dtype= "O";
             // The python representation of multidimensional columns differs from the in-memory/on-storage.
In memory, // we hold all scalars in a contiguous buffer with the shapes buffer telling us how many elements are there @@ -94,7 +93,7 @@ inline py::array array_at(const SegmentInMemory& frame, std::size_t col_pos, py: } else { dtype = fmt::format("{}{:d}", get_dtype_specifier(data_type), esize); } - } else if constexpr (is_empty_type(data_type) || is_py_bool_type(data_type) || is_numpy_array(TypeDescriptor(tag))){ + } else if constexpr (is_empty_type(data_type) || is_bool_object_type(data_type) || is_numpy_array(TypeDescriptor(tag))){ dtype = "O"; // The python representation of multidimensional columns differs from the in-memory/on-storage. In memory, // we hold all scalars in a contiguous buffer with the shapes buffer telling us how many elements are there diff --git a/cpp/arcticdb/entity/type_utils.cpp b/cpp/arcticdb/entity/type_utils.cpp index 715939590d..20fbb0cd05 100644 --- a/cpp/arcticdb/entity/type_utils.cpp +++ b/cpp/arcticdb/entity/type_utils.cpp @@ -120,7 +120,7 @@ namespace arcticdb { // Only allow promotion with UTF strings, and only to dynamic (never to fixed width) if (!is_utf_type(source_type) || !is_utf_type(target_type) || !is_dynamic_string_type(target_type)) return std::nullopt; - } else if (is_py_bool_type(source_type)) { + } else if (is_bool_object_type(source_type)) { return std::nullopt; } else { // Non-numeric source type diff --git a/cpp/arcticdb/entity/types-inl.hpp b/cpp/arcticdb/entity/types-inl.hpp index cb16fafbdf..671eee0a1d 100644 --- a/cpp/arcticdb/entity/types-inl.hpp +++ b/cpp/arcticdb/entity/types-inl.hpp @@ -39,7 +39,7 @@ constexpr auto visit_dim(DataType dt, Callable &&c) { DT_CASE(UTF_FIXED64) DT_CASE(UTF_DYNAMIC64) DT_CASE(EMPTYVAL) - DT_CASE(PYBOOL8) + DT_CASE(BOOL_OBJECT8) #undef DT_CASE default: util::raise_rte("Invalid dtype '{}' in visit dim", datatype_to_str(dt)); } @@ -67,7 +67,7 @@ auto visit_type(DataType dt, Callable &&c) { DT_CASE(UTF_FIXED64) DT_CASE(UTF_DYNAMIC64) DT_CASE(EMPTYVAL) - DT_CASE(PYBOOL8) + DT_CASE(BOOL_OBJECT8) #undef DT_CASE default: util::raise_rte("Invalid dtype '{}' in visit type", datatype_to_str(dt)); } diff --git a/cpp/arcticdb/entity/types.cpp b/cpp/arcticdb/entity/types.cpp index 4f7593b23a..a03e45cdfd 100644 --- a/cpp/arcticdb/entity/types.cpp +++ b/cpp/arcticdb/entity/types.cpp @@ -37,7 +37,7 @@ std::string_view datatype_to_str(const DataType dt) { TO_STR(UTF_FIXED64) TO_STR(UTF_DYNAMIC64) TO_STR(EMPTYVAL) - TO_STR(PYBOOL8) + TO_STR(BOOL_OBJECT8) #undef TO_STR default:return std::string_view("UNKNOWN"); } diff --git a/cpp/arcticdb/entity/types.hpp b/cpp/arcticdb/entity/types.hpp index 2a16a30df1..ca062495f1 100644 --- a/cpp/arcticdb/entity/types.hpp +++ b/cpp/arcticdb/entity/types.hpp @@ -128,7 +128,7 @@ enum class ValueType : uint8_t { // When data is appended, the column type is inferred from the data and the column is promoted to the inferred type. EMPTY = 13, /// Nullable booleans - PYBOOL = 14, + BOOL_OBJECT = 14, COUNT // Not a real value type, should not be added to proto descriptor. 
Used to count the number of items in the enum }; @@ -225,7 +225,7 @@ enum class DataType : uint8_t { UTF_FIXED64 = detail::combine_val_bits(ValueType::UTF8_FIXED, SizeBits::S64), UTF_DYNAMIC64 = detail::combine_val_bits(ValueType::UTF_DYNAMIC, SizeBits::S64), EMPTYVAL = detail::combine_val_bits(ValueType::EMPTY, SizeBits::S64), - PYBOOL8 = detail::combine_val_bits(ValueType::PYBOOL, SizeBits::S8), + BOOL_OBJECT8 = detail::combine_val_bits(ValueType::BOOL_OBJECT, SizeBits::S8), UNKNOWN = 0, }; @@ -279,8 +279,8 @@ constexpr bool is_bool_type(DataType dt) { return slice_value_type(dt) == ValueType::BOOL; } -constexpr bool is_py_bool_type(DataType dt) { - return slice_value_type(dt) == ValueType::PYBOOL; +constexpr bool is_bool_object_type(DataType dt) { + return slice_value_type(dt) == ValueType::BOOL_OBJECT; } constexpr bool is_unsigned_type(DataType dt) { @@ -410,7 +410,7 @@ DATA_TYPE_TAG(ASCII_DYNAMIC64, std::uint64_t) DATA_TYPE_TAG(UTF_FIXED64, std::uint64_t) DATA_TYPE_TAG(UTF_DYNAMIC64, std::uint64_t) DATA_TYPE_TAG(EMPTYVAL, std::uint64_t) -DATA_TYPE_TAG(PYBOOL8, uint8_t) +DATA_TYPE_TAG(BOOL_OBJECT8, uint8_t) #undef DATA_TYPE_TAG enum class Dimension : uint8_t { @@ -525,7 +525,7 @@ constexpr bool is_numpy_array(TypeDescriptor td) { } constexpr bool is_pyobject_type(TypeDescriptor td) { - return is_dynamic_string_type(slice_value_type(td.data_type())) || is_py_bool_type(td.data_type()) || + return is_dynamic_string_type(slice_value_type(td.data_type())) || is_bool_object_type(td.data_type()) || is_numpy_array(td); } diff --git a/cpp/arcticdb/pipeline/frame_utils.hpp b/cpp/arcticdb/pipeline/frame_utils.hpp index c764063509..c95e020c1a 100644 --- a/cpp/arcticdb/pipeline/frame_utils.hpp +++ b/cpp/arcticdb/pipeline/frame_utils.hpp @@ -195,7 +195,7 @@ std::optional aggregator_set_data( agg.set_array(col, t); } } - } else if constexpr(is_py_bool_type(dt)) { + } else if constexpr(is_bool_object_type(dt)) { normalization::check(tag.dimension() == Dimension::Dim0, "Multidimensional nullable booleans are not supported"); auto data = const_cast(tensor.data()); diff --git a/cpp/arcticdb/processing/aggregation.cpp b/cpp/arcticdb/processing/aggregation.cpp index 14368b8edf..4bc7ee46c5 100644 --- a/cpp/arcticdb/processing/aggregation.cpp +++ b/cpp/arcticdb/processing/aggregation.cpp @@ -139,8 +139,8 @@ namespace }; template<> - struct OutputType, void> { - using type = ScalarTagType>; + struct OutputType, void> { + using type = ScalarTagType>; }; } diff --git a/cpp/arcticdb/python/python_handlers.cpp b/cpp/arcticdb/python/python_handlers.cpp index fb2450b58d..e17f10274f 100644 --- a/cpp/arcticdb/python/python_handlers.cpp +++ b/cpp/arcticdb/python/python_handlers.cpp @@ -53,10 +53,6 @@ namespace arcticdb { return dest + count; } - constexpr inline bool is_nullable_type(TypeDescriptor td) { - return td.dimension() > Dimension::Dim0 || is_empty_type(td.data_type()) || is_py_bool_type(td.data_type()); - } - void EmptyHandler::handle_type( const uint8_t*& input, uint8_t* dest, diff --git a/cpp/arcticdb/python/python_module.cpp b/cpp/arcticdb/python/python_module.cpp index cc061dc5fe..762a74cb1b 100644 --- a/cpp/arcticdb/python/python_module.cpp +++ b/cpp/arcticdb/python/python_module.cpp @@ -291,7 +291,7 @@ void register_type_handlers() { for(const DataType& data_type : allowed_array_types) { TypeHandlerRegistry::instance()->register_handler(TypeDescriptor{data_type, Dimension::Dim1}, arcticdb::ArrayHandler()); } - TypeHandlerRegistry::instance()->register_handler(TypeDescriptor{DataType::PYBOOL8, 
Dimension::Dim0}, arcticdb::BoolHandler()); + TypeHandlerRegistry::instance()->register_handler(TypeDescriptor{DataType::BOOL_OBJECT8, Dimension::Dim0}, arcticdb::BoolHandler()); } PYBIND11_MODULE(arcticdb_ext, m) { diff --git a/cpp/arcticdb/python/python_to_tensor_frame.cpp b/cpp/arcticdb/python/python_to_tensor_frame.cpp index ba7bf1e4d5..a4be90188c 100644 --- a/cpp/arcticdb/python/python_to_tensor_frame.cpp +++ b/cpp/arcticdb/python/python_to_tensor_frame.cpp @@ -49,7 +49,7 @@ std::variant pystring_to_buffer(PyObject * std::tuple determine_python_object_type(PyObject* obj) { if(is_py_boolean(obj)) - return {ValueType::PYBOOL, 1, 1}; + return {ValueType::BOOL_OBJECT, 1, 1}; return {ValueType::BYTES, 8, 1}; } diff --git a/cpp/proto/arcticc/pb2/descriptors.proto b/cpp/proto/arcticc/pb2/descriptors.proto index 17e0bd0b2a..0dcf769a35 100644 --- a/cpp/proto/arcticc/pb2/descriptors.proto +++ b/cpp/proto/arcticc/pb2/descriptors.proto @@ -31,7 +31,7 @@ message TypeDescriptor { DYNAMIC_STRING = 11; // offset reference to deduplicated string pool DYNAMIC_UTF8 = 12; EMPTY = 13; - PYBOOL = 14; // Nullable boolean + BOOL_OBJECT = 14; // Nullable boolean } enum SizeBits { From 868c60779f24800de20cbd5ddf431ce2f1357c99 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Thu, 1 Feb 2024 17:06:10 +0200 Subject: [PATCH 13/30] Skip nullable booleans in empty types tests --- .../tests/unit/arcticdb/version_store/test_empty_column_type.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index 4618449edc..a999dc2eee 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -24,6 +24,8 @@ def float_dtype(request): # object is for nullable boolean @pytest.fixture(params=["object", "bool"]) def boolean_dtype(request): + if request.param == "object": + pytest.skip("Nullable booleans are temporarily disabled") yield request.param From 5f244cad6ea881a49e46e72ff28223c8a4312e9c Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Thu, 1 Feb 2024 17:56:07 +0200 Subject: [PATCH 14/30] Fix compiler warnings --- cpp/arcticdb/pipeline/write_frame.cpp | 2 +- python/test.py | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) delete mode 100644 python/test.py diff --git a/cpp/arcticdb/pipeline/write_frame.cpp b/cpp/arcticdb/pipeline/write_frame.cpp index 0268cef4f0..e89356295f 100644 --- a/cpp/arcticdb/pipeline/write_frame.cpp +++ b/cpp/arcticdb/pipeline/write_frame.cpp @@ -262,7 +262,7 @@ folly::Future append_frame( return index::write_index(stream::index_type_from_descriptor(frame->desc), std::move(tsd), std::move(slices_to_write), key, store); } else { const FieldCollection& new_fields{index_segment_reader.tsd().fields()}; - for (int i = 0; i < new_fields.size(); ++i) { + for (size_t i = 0; i < new_fields.size(); ++i) { const Field& new_field = new_fields.at(i); TypeDescriptor& original_type = frame->desc.mutable_field(i).mutable_type(); if (is_empty_type(original_type.data_type()) && !is_empty_type(new_field.type().data_type())) { diff --git a/python/test.py b/python/test.py deleted file mode 100644 index 29127d28a3..0000000000 --- a/python/test.py +++ /dev/null @@ -1,7 +0,0 @@ -from arcticdb import Arctic -import pandas as pd -import numpy as np -ac = Arctic("lmdb://test") -l = ac.get_library("test", create_if_missing=True) -df = pd.DataFrame({"col": [1,2,3]}) -l.write("test", df) \ No
newline at end of file From 3d8c784e79890b2c1a7143a442510a3c350f639d Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Fri, 2 Feb 2024 12:19:44 +0200 Subject: [PATCH 15/30] Work on matching compiler flags --- cpp/arcticdb/CMakeLists.txt | 9 ++++++- cpp/arcticdb/codec/zstd.hpp | 35 ++++++++++++++----------- cpp/arcticdb/processing/aggregation.cpp | 12 ++++++--- cpp/arcticdb/python/python_handlers.cpp | 9 +++---- cpp/arcticdb/util/sparse_utils.hpp | 2 +- cpp/arcticdb/util/trace.cpp | 8 ++---- cpp/arcticdb/util/trace.hpp | 6 +---- 7 files changed, 43 insertions(+), 38 deletions(-) diff --git a/cpp/arcticdb/CMakeLists.txt b/cpp/arcticdb/CMakeLists.txt index f647766e07..4466c11a88 100644 --- a/cpp/arcticdb/CMakeLists.txt +++ b/cpp/arcticdb/CMakeLists.txt @@ -157,7 +157,7 @@ ENDIF() set(CMAKE_INSTALL_RPATH "\\$ORIGIN/lib") -if (NOT WIN32) +if (NOT MSVC) if(NOT ${ARCTICDB_USING_CONDA}) # Compilers on conda-forge are relatively strict. # For now, we do not want to treat warnings as errors for those builds. @@ -166,6 +166,13 @@ if (NOT WIN32) add_compile_options("-ggdb") add_compile_options("-fvisibility=hidden") add_compile_options("-fno-omit-frame-pointer") +else() + add_compile_options( + /WX + /wd4250 # Suppress inherits via dominance warning + /we4189 # Unused local variable is an error + /we4100 # Unused function parameter is an error + ) endif () ## Core library without python bindings ## diff --git a/cpp/arcticdb/codec/zstd.hpp b/cpp/arcticdb/codec/zstd.hpp index 712c26543d..4f69ff07d2 100644 --- a/cpp/arcticdb/codec/zstd.hpp +++ b/cpp/arcticdb/codec/zstd.hpp @@ -51,28 +51,31 @@ struct ZstdBlockEncoder { struct ZstdDecoder { -/* - * encoder_version is here to support multiple versions but won't be used before we have them - */ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" - - template + /// @param[in] encoder_version Used to support multiple versions but won't be used before we have them + template static void decode_block( - std::uint32_t encoder_version, - const std::uint8_t *in, + [[maybe_unused]] std::uint32_t encoder_version, + const std::uint8_t* in, std::size_t in_bytes, - T *t_out, - std::size_t out_bytes) { + T* t_out, + std::size_t out_bytes + ) { const std::size_t decomp_size = ZSTD_getFrameContentSize(in, in_bytes); - codec::check(decomp_size == out_bytes, "expected out_bytes == zstd deduced bytes, actual {} != {}", - out_bytes, decomp_size); + codec::check( + decomp_size == out_bytes, + "expected out_bytes == zstd deduced bytes, actual {} != {}", + out_bytes, + decomp_size + ); std::size_t real_decomp = ZSTD_decompress(t_out, out_bytes, in, in_bytes); - codec::check(real_decomp == out_bytes, "expected out_bytes == zstd decompressed bytes, actual {} != {}", - out_bytes, real_decomp); + codec::check( + real_decomp == out_bytes, + "expected out_bytes == zstd decompressed bytes, actual {} != {}", + out_bytes, + real_decomp + ); } -#pragma GCC diagnostic pop }; } // namespace arcticdb::detail diff --git a/cpp/arcticdb/processing/aggregation.cpp b/cpp/arcticdb/processing/aggregation.cpp index 4bc7ee46c5..87905011a3 100644 --- a/cpp/arcticdb/processing/aggregation.cpp +++ b/cpp/arcticdb/processing/aggregation.cpp @@ -173,13 +173,13 @@ void SumAggregatorData::aggregate(const std::optional& input_ if (!data_type_.has_value() || *data_type_ == DataType::EMPTYVAL) { data_type_ = DataType::FLOAT64; } - entity::details::visit_type(*data_type_, [&input_column, unique_values, &groups, that=this] (auto global_type_desc_tag) { + 
entity::details::visit_type(*data_type_, [&input_column, unique_values, &groups, this] (auto global_type_desc_tag) { using GlobalInputType = decltype(global_type_desc_tag); if constexpr(!is_sequence_type(GlobalInputType::DataTypeTag::data_type)) { using GlobalTypeDescriptorTag = typename OutputType::type; using GlobalRawType = typename GlobalTypeDescriptorTag::DataTypeTag::raw_type; - that->aggregated_.resize(sizeof(GlobalRawType)* unique_values); - auto out_ptr = reinterpret_cast(that->aggregated_.data()); + aggregated_.resize(sizeof(GlobalRawType)* unique_values); + auto out_ptr = reinterpret_cast(aggregated_.data()); if (input_column.has_value()) { entity::details::visit_type(input_column->column_->type().data_type(), [&input_column, &groups, &out_ptr] (auto type_desc_tag) { using ColumnTagType = std::decay_t; @@ -190,7 +190,11 @@ void SumAggregatorData::aggregate(const std::optional& input_ while (auto block = col_data.next>>()) { auto ptr = reinterpret_cast(block.value().data()); for (auto i = 0u; i < block.value().row_count(); ++i, ++ptr, ++iter) { - out_ptr[groups[deref(iter)]] += GlobalRawType(*ptr); + if constexpr (std::is_same_v) { + out_ptr[groups[deref(iter)]] |= GlobalRawType(*ptr); + } else { + out_ptr[groups[deref(iter)]] += GlobalRawType(*ptr); + } } } }; diff --git a/cpp/arcticdb/python/python_handlers.cpp b/cpp/arcticdb/python/python_handlers.cpp index e17f10274f..4e9aca7f33 100644 --- a/cpp/arcticdb/python/python_handlers.cpp +++ b/cpp/arcticdb/python/python_handlers.cpp @@ -108,7 +108,7 @@ namespace arcticdb { const uint8_t*& data, uint8_t* dest, const VariantField& encoded_field_info, - size_t dest_bytes, + size_t, std::shared_ptr, EncodingVersion encding_version, const ColumnMapping& m @@ -160,7 +160,7 @@ namespace arcticdb { const uint8_t*& data, uint8_t* dest, const VariantField& encoded_field_info, - size_t dest_bytes, + size_t, std::shared_ptr buffers, EncodingVersion encoding_version, const ColumnMapping& m @@ -168,12 +168,11 @@ namespace arcticdb { util::variant_match(encoded_field_info, [&](auto field){ ARCTICDB_SAMPLE(HandleArray, 0) util::check(field->has_ndarray(), "Expected ndarray in array object handler"); - const auto row_count = dest_bytes / sizeof(PyObject*); auto ptr_dest = reinterpret_cast(dest); if(!field->ndarray().sparse_map_bytes()) { log::version().info("Array handler has no values"); - fill_with_none(ptr_dest, row_count); + fill_with_none(ptr_dest, m.num_rows_); return; } std::shared_ptr column = buffers->get_buffer(m.source_type_desc_, true); @@ -221,7 +220,7 @@ namespace arcticdb { }); ARCTICDB_SUBSAMPLE(ArrayIncNones, 0) - fill_with_none(ptr_dest, row_count - last_row); + fill_with_none(ptr_dest, m.num_rows_ - last_row); }); } diff --git a/cpp/arcticdb/util/sparse_utils.hpp b/cpp/arcticdb/util/sparse_utils.hpp index a020dd2abb..13bb5a78b0 100644 --- a/cpp/arcticdb/util/sparse_utils.hpp +++ b/cpp/arcticdb/util/sparse_utils.hpp @@ -107,7 +107,7 @@ ChunkedBuffer scan_floating_point_to_sparse( util::BitMagic& block_bitset) { auto scan_ptr = ptr; for (size_t idx = 0; idx < rows_to_write; ++idx, ++scan_ptr) { - block_bitset[bv_size(idx)] = isnan(*scan_ptr) <= 0; + block_bitset[bv_size(idx)] = !isnan(*scan_ptr); } const auto bytes = block_bitset.count() * sizeof(RawType); diff --git a/cpp/arcticdb/util/trace.cpp b/cpp/arcticdb/util/trace.cpp index 41f14e050d..5b1c6991a0 100644 --- a/cpp/arcticdb/util/trace.cpp +++ b/cpp/arcticdb/util/trace.cpp @@ -11,20 +11,16 @@ #include #endif -#include - namespace arcticdb { - std::string get_type_name(const 
std::type_info & ti){ + std::string get_type_name(const std::type_info& ti){ #ifndef _WIN32 char* demangled = abi::__cxa_demangle(ti.name(), nullptr, nullptr, nullptr); std::string ret = demangled; free(demangled); return ret; #else - //TODO: Implement get_name_type for windows - return std::string(""); + return ti.name(); #endif } - } diff --git a/cpp/arcticdb/util/trace.hpp b/cpp/arcticdb/util/trace.hpp index 17bfbf7ed0..146c9c9c91 100644 --- a/cpp/arcticdb/util/trace.hpp +++ b/cpp/arcticdb/util/trace.hpp @@ -8,14 +8,10 @@ #pragma once #include -#include -#include -#include -#include #include namespace arcticdb { -std::string get_type_name(const std::type_info & ); +std::string get_type_name(const std::type_info& type_info); } From 8d0484ad8247c8af2ed29ae850885dfc7cc0209e Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Fri, 2 Feb 2024 19:42:00 +0200 Subject: [PATCH 16/30] Add tests for updating an empty column --- cpp/arcticdb/codec/codec.cpp | 20 ++- cpp/arcticdb/column_store/column.cpp | 126 +++++++++------ cpp/arcticdb/column_store/column.hpp | 22 +-- .../version_store/test_empty_column_type.py | 151 +++++++++++++++++- 4 files changed, 257 insertions(+), 62 deletions(-) diff --git a/cpp/arcticdb/codec/codec.cpp b/cpp/arcticdb/codec/codec.cpp index 4d0edd4e8d..395479502c 100644 --- a/cpp/arcticdb/codec/codec.cpp +++ b/cpp/arcticdb/codec/codec.cpp @@ -389,14 +389,26 @@ void decode_v1( for (int i = 0; i < fields_size; ++i) { const auto& field = hdr.fields(i); const auto& field_name = desc.fields(i).name(); - util::check(data!=end, "Reached end of input block with {} fields to decode", fields_size-i); - if(auto col_index = res.column_index(field_name)) { + if (auto col_index = res.column_index(field_name)) { auto& col = res.column(static_cast(*col_index)); - data += decode_field(res.field(*col_index).type(), field, data, col, col.opt_sparse_map(), to_encoding_version(hdr.encoding_version())); + util::check( + data != end || is_empty_type(col.type().data_type()), + "Reached end of input block with {} fields to decode", + fields_size - i + ); + data += decode_field( + res.field(*col_index).type(), + field, + data, + col, + col.opt_sparse_map(), + to_encoding_version(hdr.encoding_version()) + ); } else { + util::check(data != end, "Reached end of input block with {} fields to decode", fields_size - i); data += encoding_sizes::field_compressed_size(field); } - ARCTICDB_TRACE(log::codec(), "Decoded column {} to position {}", i, data-begin); + ARCTICDB_TRACE(log::codec(), "Decoded column {} to position {}", i, data - begin); } decode_string_pool(hdr, data, begin, end, res); diff --git a/cpp/arcticdb/column_store/column.cpp b/cpp/arcticdb/column_store/column.cpp index 3be9d17f0c..c5ccc192ba 100644 --- a/cpp/arcticdb/column_store/column.cpp +++ b/cpp/arcticdb/column_store/column.cpp @@ -512,63 +512,89 @@ std::vector> Column::split(const std::shared_ptr return output; } -std::shared_ptr Column::truncate(const std::shared_ptr& column, size_t start_row, size_t end_row) { - auto type_size = get_type_size(column->type().data_type()); - // Bytes from the underlying chunked buffer to include. 
Inclusive of start_byte, exclusive of end_byte - size_t start_byte; - size_t end_byte; - util::BitMagic input_sparse_map; - if (column->is_sparse()) { - input_sparse_map = column->sparse_map(); - internal::check(input_sparse_map.size() > 0, "Unexpected empty sparse map in Column::truncate"); - // Sparse columns do not include trailing 0s in the bitset, so the relevant end_row is capped at the size of the biset +/// Bytes from the underlying chunked buffer to include when truncating. Inclusive of start_byte, exclusive of end_byte +static [[nodiscard]] std::pair column_start_end_bytes( + const Column& column, + size_t start_row, + size_t end_row +) { + const size_t type_size = get_type_size(column.type().data_type()); + size_t start_byte = start_row * type_size; + size_t end_byte = end_row * type_size; + if (column.is_sparse()) { + const util::BitMagic& input_sparse_map = column.sparse_map(); + internal::check( + input_sparse_map.size() > 0, + "Unexpected empty sparse map in Column::truncate" + ); + // Sparse columns do not include trailing 0s in the bitset, so the relevant end_row is capped at the size of the + // bitset end_row = std::min(end_row, static_cast(input_sparse_map.size())); // count_range is inclusive at both ends - auto set_bits_before_range = start_row > 0 ? input_sparse_map.count_range(0, start_row - 1) : 0; - auto set_bits_in_range = input_sparse_map.count_range(start_row, end_row - 1); + const size_t set_bits_before_range = start_row > 0 ? input_sparse_map.count_range(0, start_row - 1) : 0; + const size_t set_bits_in_range = input_sparse_map.count_range(start_row, end_row - 1); start_byte = set_bits_before_range * type_size; end_byte = start_byte + (set_bits_in_range * type_size); - } else { - start_byte = start_row * type_size; - end_byte = end_row * type_size; } internal::check( - start_byte % type_size == 0, - "start_byte {} is not a multiple of type size {}", start_byte, column->type()); + start_byte % type_size == 0, + "start_byte {} is not a multiple of type size {}", + start_byte, + column.type() + ); internal::check( - end_byte % type_size == 0, - "start_byte {} is not a multiple of type size {}", end_byte, column->type()); + end_byte % type_size == 0, + "end_byte {} is not a multiple of type size {}", + end_byte, + column.type() + ); + return {start_byte, end_byte}; +} + +static [[nodiscard]] util::BitMagic truncate_sparse_map( + const util::BitMagic& input_sparse_map, + size_t start_row, + size_t end_row +) { + // The output sparse map is the slice [start_row, end_row) of the input sparse map + // BitMagic doesn't have a method for this, so hand-roll it here + // Ctor parameter is the size + util::BitMagic output_sparse_map(end_row - start_row); + util::BitSet::bulk_insert_iterator inserter(output_sparse_map); + util::BitSetSizeType set_input_bit; + if (start_row == 0) { + // get_first can return 0 if no bits are set, but we checked earlier that input_sparse_map.size() > 0, + // and we do not have sparse maps with no bits set (except for the empty type) + set_input_bit = input_sparse_map.get_first(); + if (set_input_bit < end_row) { + inserter = set_input_bit; + } + } else { + set_input_bit = input_sparse_map.get_next(start_row - 1); + // get_next returns 0 if no more bits are set + if (set_input_bit != 0 && set_input_bit < end_row) { + // Shift start_row elements to the left + inserter = set_input_bit - start_row; + } + } + do { + set_input_bit = input_sparse_map.get_next(set_input_bit); + if (set_input_bit != 0 && set_input_bit < end_row) { + inserter =
set_input_bit - start_row; + } + } while (set_input_bit != 0); + inserter.flush(); + return output_sparse_map; +} + +std::shared_ptr Column::truncate(const std::shared_ptr& column, size_t start_row, size_t end_row) { + const auto [start_byte, end_byte] = column_start_end_bytes(*column, start_row, end_row); auto buffer = ::arcticdb::truncate(column->data_.buffer(), start_byte, end_byte); auto res = std::make_shared(column->type(), column->allow_sparse_, std::move(buffer)); if (column->is_sparse()) { - // The output sparse map is the slice [start_row, end_row) of the input sparse map - // BitMagic doesn't have a method for this, so hand-roll it here - // Ctor parameter is the size - util::BitMagic output_sparse_map(end_row - start_row); - util::BitSet::bulk_insert_iterator inserter(output_sparse_map); - util::BitSetSizeType set_input_bit; - if (start_row == 0) { - // get_first can return 0 if no bits are set, but we checked earlier that input_sparse_map.size() > 0, - // and we do not have sparse maps with no bits set - set_input_bit = input_sparse_map.get_first(); - if (set_input_bit < end_row) { - inserter = set_input_bit; - } - } else { - set_input_bit = input_sparse_map.get_next(start_row - 1); - // get_next returns 0 if no more bits are set - if (set_input_bit != 0 && set_input_bit < end_row) { - // Shift start_row elements to the left - inserter = set_input_bit - start_row; - } - } - do { - set_input_bit = input_sparse_map.get_next(set_input_bit); - if (set_input_bit != 0 && set_input_bit < end_row) { - inserter = set_input_bit - start_row; - } - } while (set_input_bit != 0); - inserter.flush(); + util::BitMagic output_sparse_map = is_empty_type(column->type().data_type()) + ? util::BitMagic{} + : truncate_sparse_map(column->sparse_map(), start_row, end_row); res->set_sparse_map(std::move(output_sparse_map)); } res->set_row_data(end_row - (start_row + 1)); @@ -672,7 +698,7 @@ void Column::regenerate_offsets() const { } } -[[nodiscard]] util::BitMagic& Column::sparse_map() { +util::BitMagic& Column::sparse_map() { if(!sparse_map_) sparse_map_ = std::make_optional(0); @@ -684,4 +710,8 @@ const util::BitMagic& Column::sparse_map() const { return sparse_map_.value(); } +std::optional& Column::opt_sparse_map() { + return sparse_map_; +} + } //namespace arcticdb diff --git a/cpp/arcticdb/column_store/column.hpp b/cpp/arcticdb/column_store/column.hpp index 9ca1d62712..03bcf65721 100644 --- a/cpp/arcticdb/column_store/column.hpp +++ b/cpp/arcticdb/column_store/column.hpp @@ -239,9 +239,9 @@ class Column { sparse_map().set_range(0, bv_size(to_row), true); } - std::optional& opt_sparse_map() { - return sparse_map_; - } + [[nodiscard]] util::BitMagic& sparse_map(); + [[nodiscard]] const util::BitMagic& sparse_map() const; + [[nodiscard]] std::optional& opt_sparse_map(); template auto begin() const { @@ -646,11 +646,18 @@ class Column { return *res; } - static std::vector> split(const std::shared_ptr& column, size_t num_rows); - /// @biref Produces a new column containing only the data in range [start_row, end_row) + [[nodiscard]] static std::vector> split( + const std::shared_ptr& column, + size_t num_rows + ); + /// @brief Produces a new column containing only the data in range [start_row, end_row) /// @param[in] start_row Inclusive start of the row range /// @param[in] end_row Exclusive end of the row range - static std::shared_ptr truncate(const std::shared_ptr& column, size_t start_row, size_t end_row); + [[nodiscard]] static std::shared_ptr truncate( + const std::shared_ptr& column, + size_t 
start_row, + size_t end_row + ); private: @@ -670,9 +677,6 @@ class Column { // Permutes the physical column storage based on the given sorted_pos. void physical_sort_external(std::vector &&sorted_pos); - [[nodiscard]] util::BitMagic& sparse_map(); - const util::BitMagic& sparse_map() const; - // Members CursoredBuffer data_; CursoredBuffer shapes_; diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index a999dc2eee..58cccc2cb4 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -37,7 +37,7 @@ def date_dtype(request): class TestCanAppendToEmptyColumn: @pytest.fixture(autouse=True) def create_empty_column(self, lmdb_version_store_static_and_dynamic): - lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col1": [None, None]})) + lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col1": 2 * [None]})) yield def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype): @@ -357,6 +357,155 @@ def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): ) ) + +class TestCanUpdateEmpty: + """ + Check that a column containing only empty type and date index can be updated and that + the type of the column will be changed to the type of the new data. The update range + is right in the middle of the initial date range, so there will be values unaffected + by the update. Check that on read the type of the unaffected entries will be changed + from empty to the new type of the column. + """ + + + def index(self): + return list(pd.date_range(start="1/1/2024", end="1/4/2024")) + + def update_index(self): + return list(pd.date_range(start="1/2/2024", end="1/3/2024")) + + @pytest.fixture(autouse=True) + def create_empty_column(self, lmdb_version_store_static_and_dynamic): + lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col": 4 * [None]}, index=self.index())) + yield + + def test_integer(self, lmdb_version_store_static_and_dynamic, int_dtype): + lmdb_version_store_static_and_dynamic.update( + "sym", + pd.DataFrame({"col": [1, 2]}, dtype=int_dtype, index=self.update_index()) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame({"col": [0,1,2,0]}, dtype=int_dtype, index=self.index()) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read( + "sym", + date_range=(pd.to_datetime("1/1/2024"), pd.to_datetime("1/1/2024")) + ).data, + pd.DataFrame({"col": [0]}, dtype=int_dtype, index=[pd.to_datetime("1/1/2024")]) + ) + + def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype): + lmdb_version_store_static_and_dynamic.update( + "sym", + pd.DataFrame({"col": [1, 2]}, dtype=float_dtype, index=self.update_index()) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame( + {"col": [float("NaN"), 1, 2, float("NaN")]}, + dtype=float_dtype, + index=self.index() + ) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read( + "sym", + date_range=(pd.to_datetime("1/1/2024"), pd.to_datetime("1/1/2024")) + ).data, + pd.DataFrame({"col": [float("NaN")]}, dtype=float_dtype, index=[pd.to_datetime("1/1/2024")]) + ) + + def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype): + lmdb_version_store_static_and_dynamic.update( + "sym", + pd.DataFrame({"col": [True, False]}, dtype=boolean_dtype, index=self.update_index()) 
+ ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame( + {"col": [None, True, False, None]}, + dtype=boolean_dtype, + index=self.index() + ) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read( + "sym", + date_range=(pd.to_datetime("1/1/2024"), pd.to_datetime("1/1/2024")) + ).data, + pd.DataFrame({"col": [None]}, dtype=boolean_dtype, index=[pd.to_datetime("1/1/2024")]) + ) + + def test_string(self, lmdb_version_store_static_and_dynamic): + lmdb_version_store_static_and_dynamic.update( + "sym", + pd.DataFrame({"col": ["a", 20*"long_string"]}, index=self.update_index()) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame( + {"col": [None, "a", 20*"long_string", None]}, + index=self.index() + ) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read( + "sym", + date_range=(pd.to_datetime("1/1/2024"), pd.to_datetime("1/1/2024")) + ).data, + pd.DataFrame({"col": [None]}, index=[pd.to_datetime("1/1/2024")]) + ) + + def test_empty(self, lmdb_version_store_static_and_dynamic): + lmdb_version_store_static_and_dynamic.update( + "sym", + pd.DataFrame({"col": 2*[None]}, index=self.update_index()) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame({"col": 4*[None]}, index=self.index()) + ) + + def test_date(self, lmdb_version_store_static_and_dynamic, date_dtype): + lmdb_version_store_static_and_dynamic.update( + "sym", + pd.DataFrame( + {"col": [np.datetime64('2005-02'), np.datetime64('2005-03')]}, + dtype=date_dtype, + index=self.update_index() + ) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read("sym").data, + pd.DataFrame( + { + "col": [ + np.datetime64('NaT'), + np.datetime64('2005-02'), + np.datetime64('2005-03'), + np.datetime64('NaT') + ] + }, + index=self.index(), + dtype=date_dtype + ) + ) + assert_frame_equal( + lmdb_version_store_static_and_dynamic.read( + "sym", + date_range=(pd.to_datetime("1/1/2024"), pd.to_datetime("1/1/2024")) + ).data, + pd.DataFrame( + { + "col": [np.datetime64('NaT', 'ns')] + }, + index=[pd.to_datetime("1/1/2024")], + dtype=date_dtype + ) + ) + class TestEmptyTypesIsOverriden: def test_cannot_append_different_type_after_first_not_none(self, lmdb_version_store_static_and_dynamic): lmdb_version_store_static_and_dynamic.write("sym", pd.DataFrame({"col": [None, None]})) From 634347274431977ff79864894fca4b329b2886ca Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Fri, 2 Feb 2024 20:35:03 +0200 Subject: [PATCH 17/30] Fix conda builds --- cpp/arcticdb/column_store/column.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/arcticdb/column_store/column.cpp b/cpp/arcticdb/column_store/column.cpp index c5ccc192ba..e3e7102973 100644 --- a/cpp/arcticdb/column_store/column.cpp +++ b/cpp/arcticdb/column_store/column.cpp @@ -513,7 +513,7 @@ std::vector> Column::split(const std::shared_ptr } /// Bytes from the underlying chunked buffer to include when truncating. 
Inclusive of start_byte, exclusive of end_byte -static [[nodiscard]] std::pair column_start_end_bytes( +[[nodiscard]] static std::pair column_start_end_bytes( const Column& column, size_t start_row, size_t end_row @@ -551,7 +551,7 @@ static [[nodiscard]] std::pair column_start_end_bytes( return {start_byte, end_byte}; } -static [[nodiscard]] util::BitMagic truncate_sparse_map( +[[nodiscard]] static util::BitMagic truncate_sparse_map( const util::BitMagic& input_sparse_map, size_t start_row, size_t end_row From 0a36fe692dd7e1871fdbbdbad61bc1e079315af9 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Mon, 5 Feb 2024 11:30:13 +0200 Subject: [PATCH 18/30] Fix MSVC release builds --- cpp/arcticdb/storage/library.hpp | 14 ++++++++++---- cpp/arcticdb/storage/lmdb/lmdb_storage.cpp | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/cpp/arcticdb/storage/library.hpp b/cpp/arcticdb/storage/library.hpp index 05e339e8c8..0dfea0c59d 100644 --- a/cpp/arcticdb/storage/library.hpp +++ b/cpp/arcticdb/storage/library.hpp @@ -72,8 +72,11 @@ class Library { if (open_mode() < OpenMode::WRITE) throw PermissionException(library_path_, open_mode(), "write"); - size_t ARCTICDB_UNUSED total_size = kvs.fold([] (size_t s, const KeySegmentPair& seg) { return s + seg.segment().total_segment_size(); }, size_t(0)); - auto kv_count ARCTICDB_UNUSED = kvs.size(); + [[maybe_unused]] const size_t total_size = kvs.fold( + [](size_t s, const KeySegmentPair& seg) { return s + seg.segment().total_segment_size(); }, + size_t(0) + ); + [[maybe_unused]] const auto kv_count = kvs.size(); storages_->write(std::move(kvs)); ARCTICDB_TRACE(log::storage(), "{} kv written, {} bytes", kv_count, total_size); } @@ -83,8 +86,11 @@ class Library { if (open_mode() < OpenMode::WRITE) throw PermissionException(library_path_, open_mode(), "update"); - size_t total_size ARCTICDB_UNUSED = kvs.fold([] (size_t s, const KeySegmentPair& seg) { return s + seg.segment().total_segment_size(); }, size_t(0)); - auto kv_count ARCTICDB_UNUSED = kvs.size(); + [[maybe_unused]] const size_t total_size = kvs.fold( + [](size_t s, const KeySegmentPair& seg) { return s + seg.segment().total_segment_size(); }, + size_t(0) + ); + [[maybe_unused]] const auto kv_count = kvs.size(); storages_->update(std::move(kvs), opts); ARCTICDB_TRACE(log::storage(), "{} kv updated, {} bytes", kv_count, total_size); } diff --git a/cpp/arcticdb/storage/lmdb/lmdb_storage.cpp b/cpp/arcticdb/storage/lmdb/lmdb_storage.cpp index 30c86ee873..dbf6b91000 100644 --- a/cpp/arcticdb/storage/lmdb/lmdb_storage.cpp +++ b/cpp/arcticdb/storage/lmdb/lmdb_storage.cpp @@ -153,7 +153,7 @@ bool LmdbStorage::do_key_exists(const VariantKey&key) { MDB_val mdb_key{stored_key.size(), stored_key.data()}; MDB_val mdb_val; return ::lmdb::dbi_get(txn, dbi.handle(), &mdb_key, &mdb_val); - } catch (const ::lmdb::not_found_error &ex) { + } catch ([[maybe_unused]] const ::lmdb::not_found_error &ex) { ARCTICDB_DEBUG(log::storage(), "Caught lmdb not found error: {}", ex.what()); return false; } From 093e4f620540f34b3626be7382c79fda01bc6161 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Mon, 5 Feb 2024 12:31:55 +0200 Subject: [PATCH 19/30] More MSVC compilation error fixes --- cpp/arcticdb/storage/s3/mock_s3_client.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/arcticdb/storage/s3/mock_s3_client.cpp b/cpp/arcticdb/storage/s3/mock_s3_client.cpp index 31eeb1fefa..6b71c6be93 100644 --- a/cpp/arcticdb/storage/s3/mock_s3_client.cpp +++ b/cpp/arcticdb/storage/s3/mock_s3_client.cpp 
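The MSVC-related fixes in patches 18 and 19 rely on two idioms: values consumed only by trace/logging macros (which may compile to nothing in release builds) are annotated [[maybe_unused]], and catch clauses that only need to know the exception type leave the parameter unnamed. A minimal standalone sketch of both idioms, with TRACE as a hypothetical stand-in for a logging macro that expands to nothing (not part of the patch):

    #include <cstddef>
    #include <exception>
    #include <string>

    // Hypothetical stand-in for a logging macro that compiles away in release builds.
    #define TRACE(x) ((void)0)

    int parse_or_zero(const std::string& s) {
        [[maybe_unused]] const std::size_t len = s.size();  // read only by the disabled TRACE macro
        TRACE(len);
        try {
            return std::stoi(s);
        } catch (const std::exception&) {  // unnamed parameter: only the type matters here
            return 0;
        }
    }

With warnings promoted to errors (/WX, /we4189, /we4100), both idioms keep debug-only values legal without #pragma suppression blocks like the one removed from zstd.hpp.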
@@ -50,7 +50,7 @@ std::optional has_failure_trigger(const std::string& s3_object auto failure_code_string = s3_object_name.substr(position + failure_string_for_operation.size()); auto failure_code = Aws::S3::S3Errors(std::stoi(failure_code_string)); return Aws::S3::S3Error(Aws::Client::AWSError(failure_code, "Simulated error", "Simulated error message", true)); - } catch (std::exception& e) { + } catch (std::exception&) { return std::nullopt; } } From ff4bcdd142cf02011478a3727364647cac4b84e9 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Mon, 5 Feb 2024 16:37:08 +0200 Subject: [PATCH 20/30] Fix compilation error in MSVC for C++ unit tests --- .../column_store/test/test_memory_segment.cpp | 21 ++++++++++++------- cpp/arcticdb/pipeline/test/test_pipeline.cpp | 4 +++- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/cpp/arcticdb/column_store/test/test_memory_segment.cpp b/cpp/arcticdb/column_store/test/test_memory_segment.cpp index 6782993c5f..3416242e75 100644 --- a/cpp/arcticdb/column_store/test/test_memory_segment.cpp +++ b/cpp/arcticdb/column_store/test/test_memory_segment.cpp @@ -17,6 +17,7 @@ #include #include +#include TEST(MemSegment, Empty) { using namespace arcticdb; @@ -138,9 +139,9 @@ TEST(MemSegment, IterateWithEmptyTypeColumn) { for (auto&& [idx, row]: folly::enumerate(seg)) { ASSERT_EQ(static_cast(idx), row.scalar_at(0)); // Exception should be thrown regardless of the type requested for empty type columns - EXPECT_THROW(row.scalar_at(1).has_value(), InternalException); - EXPECT_THROW(row.scalar_at(1).has_value(), InternalException); - EXPECT_THROW(row.scalar_at(1).has_value(), InternalException); + EXPECT_THROW([[maybe_unused]] auto v = row.scalar_at(1).has_value(), InternalException); + EXPECT_THROW([[maybe_unused]] auto v = row.scalar_at(1).has_value(), InternalException); + EXPECT_THROW([[maybe_unused]] auto v = row.scalar_at(1).has_value(), InternalException); } } @@ -176,7 +177,13 @@ TEST(MemSegment, ModifyViaIterator) { auto &segment = frame_wrapper.segment_; for (auto &row : segment) { for (auto &value : row) { - value.visit([](auto &v) { v += 1; }); + value.visit([](auto& v) { + if constexpr (std::is_same_v>, bool>) { + v |= 1; + } else { + v += 1; + } + }); } } @@ -479,9 +486,9 @@ TEST(MemSegment, Filter) { for (auto&& [idx, row]: folly::enumerate(filtered_seg)) { ASSERT_EQ(static_cast(retained_rows[idx]), row.scalar_at(0)); // Exception should be thrown regardless of the type requested for empty type columns - EXPECT_THROW(row.scalar_at(1).has_value(), InternalException); - EXPECT_THROW(row.scalar_at(1).has_value(), InternalException); - EXPECT_THROW(row.scalar_at(1).has_value(), InternalException); + EXPECT_THROW([[maybe_unused]] auto v = row.scalar_at(1).has_value(), InternalException); + EXPECT_THROW([[maybe_unused]] auto v = row.scalar_at(1).has_value(), InternalException); + EXPECT_THROW([[maybe_unused]] auto v = row.scalar_at(1).has_value(), InternalException); } } diff --git a/cpp/arcticdb/pipeline/test/test_pipeline.cpp b/cpp/arcticdb/pipeline/test/test_pipeline.cpp index ac3799d929..0c263c068d 100644 --- a/cpp/arcticdb/pipeline/test/test_pipeline.cpp +++ b/cpp/arcticdb/pipeline/test/test_pipeline.cpp @@ -182,7 +182,9 @@ TEST(Pipeline, Basic) { Pipeline pipeline(ex); TestFilter even_filter{[](const SegmentInMemory::Row& row) { return row[0].visit([](auto val) { - if constexpr(std::is_integral_v) + if constexpr (std::is_same_v) + return val == false; + else if constexpr(std::is_integral_v) return val % 2 == 0; else return false; From 
c2ef8529d9d01ccf371d910c818d0f1df6e6c34b Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Fri, 9 Feb 2024 16:46:47 +0200 Subject: [PATCH 21/30] Fix rewrite partial segment so that it doesn't try to truncate with empty ranges --- cpp/arcticdb/pipeline/write_frame.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/arcticdb/pipeline/write_frame.cpp b/cpp/arcticdb/pipeline/write_frame.cpp index e89356295f..d91facbc09 100644 --- a/cpp/arcticdb/pipeline/write_frame.cpp +++ b/cpp/arcticdb/pipeline/write_frame.cpp @@ -292,7 +292,7 @@ static RowRange partial_rewrite_row_range( return {0, bound->row_pos()}; } else { const timestamp end = std::get(range.end_); - auto bound = std::upper_bound(std::begin(segment), std::end(segment), end, [](timestamp t, auto& row) { + auto bound = std::upper_bound(std::begin(segment), std::end(segment), end, [](timestamp t, const auto& row) { return t < row.template index(); }); return {bound->row_pos(), segment.row_count()}; } } @@ -339,7 +339,10 @@ std::optional rewrite_partial_segment( const SegmentInMemory& segment = kv.second; const IndexRange affected_index_range = partial_rewrite_index_range(existing_range, index_range, affected_part); const RowRange affected_row_range = partial_rewrite_row_range(segment, index_range, affected_part); - const size_t num_rows = affected_row_range.end() - affected_row_range.start(); + const int64_t num_rows = affected_row_range.end() - affected_row_range.start(); + if (num_rows <= 0) { + return std::nullopt; + } SegmentInMemory output = segment.truncate(affected_row_range.start(), affected_row_range.end(), true); FrameSlice new_slice{ std::make_shared(output.descriptor()), From c6c212fd5dc41aafeea7362de6973432d663b14d Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Wed, 14 Feb 2024 13:54:51 +0200 Subject: [PATCH 22/30] Fix string reducer for static schema when an empty segment of a string column is read --- cpp/arcticdb/pipeline/frame_slice_map.hpp | 27 ++++++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/cpp/arcticdb/pipeline/frame_slice_map.hpp b/cpp/arcticdb/pipeline/frame_slice_map.hpp index 7b23eb4ee2..9579694215 100644 --- a/cpp/arcticdb/pipeline/frame_slice_map.hpp +++ b/cpp/arcticdb/pipeline/frame_slice_map.hpp @@ -8,8 +8,6 @@ #pragma once #include -#include -#include namespace arcticdb::pipelines { @@ -35,13 +33,30 @@ struct FrameSliceMap { continue; } - if(!dynamic_schema && !is_sequence_type(field->type().data_type())) { - ARCTICDB_DEBUG(log::version(), "{} not a string type in dynamic schema, skipping", field->name()); - continue; + const entity::DataType row_range_type = field->type().data_type(); + if(!dynamic_schema && !is_sequence_type(row_range_type)) { + // In case we end up with static schema and an empty-typed field we must check the type of the whole column, + // because we could be reading an empty segment of a string column. Example: start with [None], + // then append ["string"]. If we read with row range [0;1) we would end up reading only the empty segment. + // On read the empty type handler will fill the segment with not_a_string() and the reducer must + // run over those values.
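+                // A hypothetical session showing the case described above (library API names are illustrative only):
+                //     lib.write("sym", pd.DataFrame({"col": [None]}))       # column stored with the empty type
+                //     lib.append("sym", pd.DataFrame({"col": ["string"]}))  # column promoted to a string type
+                //     lib.read("sym", row_range=(0, 1))                     # reads only the empty segment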
+ // TODO: This logic won't be needed when we move string handling into separate type handler + if(is_empty_type(row_range_type)) { + const entity::StreamDescriptor& descriptor = context_->descriptor(); + const size_t global_field_idx = descriptor.find_field(field->name()).value(); + const Field& global_field = descriptor.field(global_field_idx); + const entity::DataType global_field_type = global_field.type().data_type(); + if(!is_sequence_type(global_field_type)) { + ARCTICDB_DEBUG(log::version(), "{} not a string type in dynamic schema, skipping", field->name()); + continue; + } + } else { + ARCTICDB_DEBUG(log::version(), "{} not a string type in dynamic schema, skipping", field->name()); + continue; + } } auto& column = columns_[field->name()]; - ContextData data{context_row.index_, field.index}; column.insert(std::make_pair(row_range, data)); } From 224916dd7bb37eb01be9c29c021668ab924aa814 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Wed, 14 Feb 2024 14:09:21 +0200 Subject: [PATCH 23/30] Add comment on how pandas handles None when dtype is bool --- .../unit/arcticdb/version_store/test_empty_column_type.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py index 58cccc2cb4..e033889d49 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_column_type.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_column_type.py @@ -69,6 +69,7 @@ def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype): ) def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype): + # Note: if dtype is bool pandas will convert None to False df_non_empty = pd.DataFrame({"col1": np.array([True, False, True], dtype=boolean_dtype)}) lmdb_version_store_static_and_dynamic.append("sym", df_non_empty) expected_result = pd.DataFrame({"col1": np.array([None, None, True, False, True], dtype=boolean_dtype)}) @@ -185,6 +186,7 @@ def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype): ) def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype): + # Note: if dtype is bool pandas will convert None to False df_initial = pd.DataFrame({"col1": np.array([True, False, True], dtype=boolean_dtype)}) df_with_none = pd.DataFrame({"col1": np.array([None, None])}) lmdb_version_store_static_and_dynamic.write("sym", df_initial) @@ -295,6 +297,7 @@ def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype): ) def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype): + # Note: if dtype is bool pandas will convert None to False lmdb_version_store_static_and_dynamic.write( "sym", pd.DataFrame({"col": [True, True, True, True]}, dtype=boolean_dtype, index=self.index()) @@ -418,6 +421,7 @@ def test_float(self, lmdb_version_store_static_and_dynamic, float_dtype): ) def test_bool(self, lmdb_version_store_static_and_dynamic, boolean_dtype): + # Note: if dtype is bool pandas will convert None to False lmdb_version_store_static_and_dynamic.update( "sym", pd.DataFrame({"col": [True, False]}, dtype=boolean_dtype, index=self.update_index()) From a18f2b2139e3c4a71d4d27cdc245ac6fce3748fb Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Wed, 14 Feb 2024 15:00:09 +0200 Subject: [PATCH 24/30] Use std::array for flatten_and_fix_rows instead of boost::span --- cpp/arcticdb/pipeline/write_frame.cpp | 4 +++- cpp/arcticdb/pipeline/write_frame.hpp | 22 +++++++++++++++++----- cpp/arcticdb/version/version_core.cpp | 
4 ++-- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/cpp/arcticdb/pipeline/write_frame.cpp b/cpp/arcticdb/pipeline/write_frame.cpp index d91facbc09..8a274f51b6 100644 --- a/cpp/arcticdb/pipeline/write_frame.cpp +++ b/cpp/arcticdb/pipeline/write_frame.cpp @@ -27,6 +27,8 @@ #include #include #include +#include +#include namespace arcticdb::pipelines { @@ -361,7 +363,7 @@ std::optional rewrite_partial_segment( return SliceAndKey{std::move(new_slice), std::get(std::move(fut_key).get())}; } -std::vector flatten_and_fix_rows(boost::span> groups, size_t& global_count) { +std::vector flatten_and_fix_rows(const std::array, 5>& groups, size_t& global_count) { std::vector output; output.reserve(groups.size()); global_count = 0; diff --git a/cpp/arcticdb/pipeline/write_frame.hpp b/cpp/arcticdb/pipeline/write_frame.hpp index 531761cd7a..1fee3e7654 100644 --- a/cpp/arcticdb/pipeline/write_frame.hpp +++ b/cpp/arcticdb/pipeline/write_frame.hpp @@ -9,7 +9,7 @@ #include -#include + #include #include #include @@ -19,8 +19,6 @@ #include #include -#include - namespace arcticdb::pipelines { using namespace arcticdb::stream; @@ -74,7 +72,21 @@ std::optional rewrite_partial_segment( AffectedSegmentPart affected_part, const std::shared_ptr& store); -// TODO: Use std::span when C++20 is enabled -std::vector flatten_and_fix_rows(boost::span> groups, size_t& global_count); + +/// Used, when updating a segment, to convert all 5 affected groups into a single list of slices +/// The 5 groups are: +/// * Segments before the update range which do not intersect with it and are not affected by +/// the update +/// * Segments before the update range which are intersecting with it and are partially affected +/// by the update. +/// * Segments which are fully contained inside the update range +/// * Segments after the update range which are intersecting with it and are partially affected +/// by the update +/// * Segments after the update range which do not intersect with it and are not affected by the +/// update +std::vector flatten_and_fix_rows( + const std::array, 5>& groups, + size_t& global_count +); } //namespace arcticdb::pipelines \ No newline at end of file diff --git a/cpp/arcticdb/version/version_core.cpp b/cpp/arcticdb/version/version_core.cpp index fe50495606..a575ed9ca5 100644 --- a/cpp/arcticdb/version/version_core.cpp +++ b/cpp/arcticdb/version/version_core.cpp @@ -283,7 +283,7 @@ VersionedItem delete_range_impl( std::move(intersect_before), std::move(intersect_after), strictly_after(orig_filter_range, unaffected_keys)}; - auto flattened_slice_and_keys = flatten_and_fix_rows(boost::span{groups}, row_count); + auto flattened_slice_and_keys = flatten_and_fix_rows(groups, row_count); std::sort(std::begin(flattened_slice_and_keys), std::end(flattened_slice_and_keys)); bool bucketize_dynamic = index_segment_reader.bucketize_dynamic(); @@ -387,7 +387,7 @@ VersionedItem update_impl( std::move(new_slice_and_keys), std::move(intersect_after), strictly_after(orig_filter_range, unaffected_keys)}; - auto flattened_slice_and_keys = flatten_and_fix_rows(boost::span{groups}, row_count); + auto flattened_slice_and_keys = flatten_and_fix_rows(groups, row_count); util::check(unaffected_keys.size() + new_keys_size + (affected_keys.size() * 2) >= flattened_slice_and_keys.size(), "Output size mismatch: {} + {} + (2 * {}) < {}", From 3033303beb3b656b73ab80530ed6f87d47ec7a42 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Wed, 14 Feb 2024 18:14:11 +0200 Subject: [PATCH 25/30] Refactor string pool reconstruction in 
segment truncation as per PR comment suggestion --- .../column_store/memory_segment_impl.cpp | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/cpp/arcticdb/column_store/memory_segment_impl.cpp b/cpp/arcticdb/column_store/memory_segment_impl.cpp index f483b7b4a9..5102910734 100644 --- a/cpp/arcticdb/column_store/memory_segment_impl.cpp +++ b/cpp/arcticdb/column_store/memory_segment_impl.cpp @@ -441,25 +441,24 @@ std::shared_ptr SegmentInMemoryImpl::truncate( output->set_metadata(std::move(metadata)); } - for(const auto&& [idx, column] : folly::enumerate(columns_)) { + for (const auto&& [idx, column] : folly::enumerate(columns_)) { const DataType column_type = column->type().data_type(); const Field& field = descriptor_->field(idx); + std::shared_ptr truncated_column = Column::truncate(column, start_row, end_row); + if (is_sequence_type(column_type) && reconstruct_string_pool) { - output->add_column(field, num_values, true); - const ChunkedBuffer& source = column->data().buffer(); - ChunkedBuffer& target = output->column(idx).data().buffer(); - for (size_t row = 0; row < num_values; ++row) { - const StringPool::offset_t offset = util::variant_match( - get_string_from_buffer(start_row + row, source, *string_pool_), - [&](std::string_view sv) { return output->string_pool().get(sv).offset(); }, - [](StringPool::offset_t offset) { return offset; } - ); - set_offset_string_at(row, target, offset); + ChunkedBuffer& truncated_buffer = truncated_column->data().buffer(); + for (size_t row = 0; row < truncated_column->row_count(); ++row) { + const entity::position_t offset_val = get_offset_string_at(row, truncated_buffer); + if (is_a_string(offset_val)) { + const std::string_view string = get_string_from_pool(offset_val, *string_pool_); + const StringPool::offset_t new_offset = output->string_pool().get(string).offset(); + set_offset_string_at(row, truncated_buffer, new_offset); + } } - } else { - auto truncated_column = Column::truncate(column, start_row, end_row); - output->add_column(field, truncated_column); } + + output->add_column(field, truncated_column); } output->attach_descriptor(descriptor_); return output; From 0d0c31e04835876511e6b2cf0c6530d9ed2812e8 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Wed, 14 Feb 2024 19:28:05 +0200 Subject: [PATCH 26/30] Fix compilation errors under linux --- cpp/arcticdb/column_store/memory_segment_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/arcticdb/column_store/memory_segment_impl.cpp b/cpp/arcticdb/column_store/memory_segment_impl.cpp index 5102910734..c2aa86d32e 100644 --- a/cpp/arcticdb/column_store/memory_segment_impl.cpp +++ b/cpp/arcticdb/column_store/memory_segment_impl.cpp @@ -448,7 +448,7 @@ std::shared_ptr SegmentInMemoryImpl::truncate( if (is_sequence_type(column_type) && reconstruct_string_pool) { ChunkedBuffer& truncated_buffer = truncated_column->data().buffer(); - for (size_t row = 0; row < truncated_column->row_count(); ++row) { + for (position_t row = 0; row < truncated_column->row_count(); ++row) { const entity::position_t offset_val = get_offset_string_at(row, truncated_buffer); if (is_a_string(offset_val)) { const std::string_view string = get_string_from_pool(offset_val, *string_pool_); From 5684779a8ad22860648a7305f0ca4f80698a6347 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Thu, 15 Feb 2024 15:31:43 +0200 Subject: [PATCH 27/30] Prevent creation of sparse map for empty typed columns --- cpp/arcticdb/codec/codec-inl.hpp | 1 + cpp/arcticdb/codec/codec.cpp | 
3 ++- cpp/arcticdb/column_store/column.cpp | 8 ++++---- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/arcticdb/codec/codec-inl.hpp b/cpp/arcticdb/codec/codec-inl.hpp index 1c628cb928..cb170cf6a8 100644 --- a/cpp/arcticdb/codec/codec-inl.hpp +++ b/cpp/arcticdb/codec/codec-inl.hpp @@ -131,6 +131,7 @@ std::size_t decode_ndarray( } if(field.sparse_map_bytes()) { + util::check(!is_empty_type(type_desc_tag.data_type()), "Empty typed columns should not have sparse map"); util::check_magic(data_in); const auto bitmap_size = field.sparse_map_bytes() - util::combined_bit_magic_delimiters_size(); bv = util::deserialize_bytes_to_bitmap(data_in, bitmap_size); diff --git a/cpp/arcticdb/codec/codec.cpp b/cpp/arcticdb/codec/codec.cpp index 395479502c..1aeebc689a 100644 --- a/cpp/arcticdb/codec/codec.cpp +++ b/cpp/arcticdb/codec/codec.cpp @@ -514,7 +514,8 @@ void encode_sparse_map( Buffer& out, std::ptrdiff_t& pos ) { - if (column_data.bit_vector() != nullptr && column_data.bit_vector()->count() > 0) { + if (column_data.bit_vector() != nullptr && column_data.bit_vector()->count() > 0) { + util::check(!is_empty_type(column_data.type().data_type()), "Empty typed columns should not have sparse maps"); ARCTICDB_DEBUG(log::codec(), "Sparse map count = {} pos = {}", column_data.bit_vector()->count(), pos); const size_t sparse_bm_bytes = encode_bitmap(*column_data.bit_vector(), out, pos); util::variant_match(variant_field, [&](auto field) { diff --git a/cpp/arcticdb/column_store/column.cpp b/cpp/arcticdb/column_store/column.cpp index e3e7102973..c8d991efc8 100644 --- a/cpp/arcticdb/column_store/column.cpp +++ b/cpp/arcticdb/column_store/column.cpp @@ -323,6 +323,9 @@ void Column::default_initialize_rows(size_t start_pos, size_t num_rows, bool ens } void Column::set_row_data(size_t row_id) { + if (is_empty_type(type_.data_type())) { + return; + } last_logical_row_ = row_id; const auto last_stored_row = row_count() - 1; if(sparse_map_) { @@ -592,10 +595,7 @@ std::shared_ptr Column::truncate(const std::shared_ptr& column, auto buffer = ::arcticdb::truncate(column->data_.buffer(), start_byte, end_byte); auto res = std::make_shared(column->type(), column->allow_sparse_, std::move(buffer)); if (column->is_sparse()) { - util::BitMagic output_sparse_map = is_empty_type(column->type().data_type()) - ? util::BitMagic{} - : truncate_sparse_map(column->sparse_map(), start_row, end_row); - res->set_sparse_map(std::move(output_sparse_map)); + res->set_sparse_map(truncate_sparse_map(column->sparse_map(), start_row, end_row)); } res->set_row_data(end_row - (start_row + 1)); return res; From 275b62d48a5306d5b562857c9ceac9e8e8673a48 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Fri, 16 Feb 2024 18:24:18 +0200 Subject: [PATCH 28/30] Use Column::transform when reconstructing the string pool Use template type-deduced functor in Column::transform instead of folly::Function. --- cpp/arcticdb/column_store/column.hpp | 17 +++++++++--- .../column_store/memory_segment_impl.cpp | 27 +++++++++---------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/cpp/arcticdb/column_store/column.hpp b/cpp/arcticdb/column_store/column.hpp index da08078812..b5698318d8 100644 --- a/cpp/arcticdb/column_store/column.hpp +++ b/cpp/arcticdb/column_store/column.hpp @@ -669,6 +669,9 @@ class Column { * When adding new uses of these static methods, add an internal::check<...>(f.heapAllocatedMemory() == 0,...) * and run all of the tests in CI on all supported platforms. 
We do not want these checks in the released code, * but also do not want any passed in lambdas to be heap allocated. + * + * TODO: Use a type-deduced functor (with enable_if to verify its in/out types) in all functions similar to transform. + * Once C++20 is available, use concepts. */ template @@ -680,13 +683,21 @@ class Column { }); } - template + template < + typename input_tdt, + typename output_tdt, + typename FunctorT, + typename = std::enable_if_t>> static void transform(const Column& input_column, Column& output_column, - folly::Function&& f) { + FunctorT&& f + ) { auto input_data = input_column.data(); auto output_data = output_column.data(); - std::transform(input_data.cbegin(), input_data.cend(), output_data.begin(), std::move(f)); + std::transform(input_data.cbegin(), input_data.cend(), output_data.begin(), std::forward(f)); } template diff --git a/cpp/arcticdb/column_store/memory_segment_impl.cpp b/cpp/arcticdb/column_store/memory_segment_impl.cpp index c2aa86d32e..54342c9ff8 100644 --- a/cpp/arcticdb/column_store/memory_segment_impl.cpp +++ b/cpp/arcticdb/column_store/memory_segment_impl.cpp @@ -442,23 +442,22 @@ std::shared_ptr SegmentInMemoryImpl::truncate( } for (const auto&& [idx, column] : folly::enumerate(columns_)) { - const DataType column_type = column->type().data_type(); + const TypeDescriptor column_type = column->type(); const Field& field = descriptor_->field(idx); std::shared_ptr truncated_column = Column::truncate(column, start_row, end_row); - - if (is_sequence_type(column_type) && reconstruct_string_pool) { - ChunkedBuffer& truncated_buffer = truncated_column->data().buffer(); - for (position_t row = 0; row < truncated_column->row_count(); ++row) { - const entity::position_t offset_val = get_offset_string_at(row, truncated_buffer); - if (is_a_string(offset_val)) { - const std::string_view string = get_string_from_pool(offset_val, *string_pool_); - const StringPool::offset_t new_offset = output->string_pool().get(string).offset(); - set_offset_string_at(row, truncated_buffer, new_offset); - } + column_type.visit_tag([&](auto tag) { + if constexpr (is_sequence_type(decltype(tag)::data_type())) { + Column::transform( + *truncated_column, + *truncated_column, + [this, &output](auto string_pool_offset) { + const std::string_view string = get_string_from_pool(string_pool_offset, *string_pool_); + return output->string_pool().get(string).offset(); + } + ); } - } - - output->add_column(field, truncated_column); + }); + output->add_column(field, std::move(truncated_column)); } output->attach_descriptor(descriptor_); return output; From e6e3e238b1b95edc9f24f442da94228321aab668 Mon Sep 17 00:00:00 2001 From: Vasil Pashov Date: Fri, 16 Feb 2024 19:50:37 +0200 Subject: [PATCH 29/30] Fix MSVC compilation errors --- cpp/arcticdb/storage/azure/azure_mock_client.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/arcticdb/storage/azure/azure_mock_client.cpp b/cpp/arcticdb/storage/azure/azure_mock_client.cpp index 5b0bf3b5ce..ef848f3d48 100644 --- a/cpp/arcticdb/storage/azure/azure_mock_client.cpp +++ b/cpp/arcticdb/storage/azure/azure_mock_client.cpp @@ -52,7 +52,7 @@ std::optional has_failure_trigger(const std auto error_message = fmt::format("Simulated Error, message: #{}_{}", failure_string_for_operation, (int) status_code); return get_exception(error_message, status_code); - } catch (std::exception& e) { + } catch (std::exception&) { return std::nullopt; } } From aeff5a26643b6cae3b1fd83bbac2ee405413a512 Mon Sep 17 00:00:00 2001 From: Vasil Pashov
Date: Mon, 19 Feb 2024 15:01:06 +0200 Subject: [PATCH 30/30] Fix string pool reconstruction by adding checks for is_a_string --- cpp/arcticdb/column_store/memory_segment_impl.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/arcticdb/column_store/memory_segment_impl.cpp b/cpp/arcticdb/column_store/memory_segment_impl.cpp index 54342c9ff8..d834561ad4 100644 --- a/cpp/arcticdb/column_store/memory_segment_impl.cpp +++ b/cpp/arcticdb/column_store/memory_segment_impl.cpp @@ -450,9 +450,12 @@ std::shared_ptr SegmentInMemoryImpl::truncate( Column::transform( *truncated_column, *truncated_column, - [this, &output](auto string_pool_offset) { - const std::string_view string = get_string_from_pool(string_pool_offset, *string_pool_); - return output->string_pool().get(string).offset(); + [this, &output](auto string_pool_offset) -> arcticdb::OffsetString::offset_t { + if (is_a_string(string_pool_offset)) { + const std::string_view string = get_string_from_pool(string_pool_offset, *string_pool_); + return output->string_pool().get(string).offset(); + } + return string_pool_offset; } ); }
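This final patch encodes a general rule for string pool reconstruction: offsets that represent missing values (None/NaN placeholders) are not references into the pool and must pass through unchanged, while genuine offsets are re-interned into the destination pool. A self-contained sketch of that rule, using an illustrative sentinel in place of ArcticDB's actual is_a_string()/not-a-string constants:

    #include <cstdint>
    #include <limits>
    #include <unordered_map>
    #include <vector>

    using offset_t = std::uint64_t;

    // Illustrative sentinel for "not a string"; the real codebase uses dedicated constants.
    constexpr offset_t not_a_string = std::numeric_limits<offset_t>::max();
    constexpr bool is_a_string(offset_t v) { return v != not_a_string; }

    // Remap column offsets into a new pool, leaving sentinel values untouched.
    std::vector<offset_t> remap_offsets(const std::vector<offset_t>& column,
                                        const std::unordered_map<offset_t, offset_t>& old_to_new) {
        std::vector<offset_t> out;
        out.reserve(column.size());
        for (const offset_t v : column) {
            out.push_back(is_a_string(v) ? old_to_new.at(v) : v);
        }
        return out;
    }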