Bugfix: Empty type #1227

Merged · 36 commits · Feb 20, 2024

Changes from 21 commits

Commits
e8e5fae
Add src and dest type descriptors to decode or expand. Add backfilling.
Jan 10, 2024
0ab4c22
Fix string backfills in empty type. Add test cases for all types to c…
Jan 11, 2024
d350261
Add a test for appending none to int
Jan 15, 2024
8eb7253
Another batch of fixes for the empty data type
Jan 22, 2024
840d899
Use boost span for flatten_and_fix_rows
Jan 25, 2024
1c97adf
Use TypeDescriptor constructor instead of static_cast
Jan 25, 2024
5638585
Use truncate columns on update and fix pybool type handler so that if…
Jan 29, 2024
0efb037
Complete testing for updating with empty type column
Jan 30, 2024
f25b788
Additional test
Jan 30, 2024
17b642c
Forward declare buffer holder
Jan 31, 2024
0334f45
Add a single function to handle default initialization.
Feb 1, 2024
e3d71cb
Change pybool type name to the language-agnostic name bool_object
Feb 1, 2024
907f103
Merge branch 'master' into empty-type-fixes
Feb 1, 2024
868c607
Skip nullable booleans in empty types tests
Feb 1, 2024
5f244ca
Fix compiler warnings
Feb 1, 2024
3d8c784
Work on matching compiler flags
Feb 2, 2024
8d0484a
Add tests for updating an empty column
Feb 2, 2024
6343472
Fix conda builds
Feb 2, 2024
0a36fe6
Fix MSVC release builds
Feb 5, 2024
98687fa
Merge branch 'master' into empty-type-fixes
Feb 5, 2024
093e4f6
More MSVC compilation error fixes
Feb 5, 2024
ff4bcdd
Fix compilation error in MSVC for C++ unit tests
Feb 5, 2024
c2ef852
Fix rewrite partial segment so that it doesn't try to truncate with e…
Feb 9, 2024
507fdaa
Merge branch 'master' into empty-type-fixes-2
Feb 9, 2024
c6c212f
Fix string reducer for static schema when empty segment part of a str…
Feb 14, 2024
b3d477f
Merge branch 'master' into empty-type-fixes
Feb 14, 2024
224916d
Add comment on how pandas handles None when dtype is bool
Feb 14, 2024
a18f2b2
Use std::array for flatten_and_fix_rows instead of boost::span
Feb 14, 2024
3033303
Refactor string pool reconstruction in segment truncation as per PR c…
Feb 14, 2024
0d0c31e
Fix compilation errors under linux
Feb 14, 2024
5684779
Prevent creation of sparse map for empty typed columns
Feb 15, 2024
275b62d
Use Column::transform when reconstructing the string pool
Feb 16, 2024
e0d2c8d
Merge branch 'master' into empty-type-fixes-2
Feb 16, 2024
e6e3e23
Fix MSVC compilation errors
Feb 16, 2024
aeff5a2
Fix string pool reconstruction by adding checks for is_a_string
Feb 19, 2024
380663f
Merge branch 'master' into empty-type-fixes
Feb 19, 2024
14 changes: 12 additions & 2 deletions cpp/arcticdb/CMakeLists.txt
@@ -157,7 +157,7 @@ ENDIF()

set(CMAKE_INSTALL_RPATH "\\$ORIGIN/lib")

if (NOT WIN32)
if (NOT MSVC)
if(NOT ${ARCTICDB_USING_CONDA})
# Compilers on conda-forge are relatively strict.
# For now, we do not want to treat warnings as errors for those builds.
@@ -166,6 +166,13 @@ if (NOT WIN32)
add_compile_options("-ggdb")
add_compile_options("-fvisibility=hidden")
add_compile_options("-fno-omit-frame-pointer")
else()
add_compile_options(
/WX
/wd4250 # Suppress inherits via dominance warning
/we4189 # Unused local variable is an error
/we4100 # Unused function parameter is an error
)
endif ()

## Core library without python bindings ##
@@ -200,6 +207,7 @@ set(arcticdb_srcs
entity/key.hpp
entity/metrics.hpp
entity/data_error.hpp
entity/descriptors.hpp
entity/native_tensor.hpp
entity/performance_tracing.hpp
entity/protobuf_mappings.hpp
@@ -208,6 +216,7 @@ set(arcticdb_srcs
entity/serialized_key.hpp
entity/type_conversion.hpp
entity/types.hpp
entity/type_utils.hpp
entity/types-inl.hpp
entity/variant_key.hpp
entity/versioned_item.hpp
@@ -233,7 +242,6 @@
pipeline/read_pipeline.hpp
pipeline/slicing.hpp
pipeline/string_pool_utils.hpp
pipeline/string_pool_utils.hpp
pipeline/value.hpp
pipeline/value_set.hpp
pipeline/write_frame.hpp
@@ -376,7 +384,9 @@ set(arcticdb_srcs
entity/metrics.cpp
entity/performance_tracing.cpp
entity/types.cpp
entity/type_utils.cpp
log/log.cpp
pipeline/column_mapping.cpp
pipeline/column_stats.cpp
pipeline/frame_slice.cpp
pipeline/frame_utils.cpp
4 changes: 0 additions & 4 deletions cpp/arcticdb/codec/codec-inl.hpp
@@ -18,10 +18,6 @@

#include <arcticdb/util/bitset.hpp>

#include <arcticdb/util/buffer.hpp>

#include <google/protobuf/text_format.h>

#include <type_traits>

namespace arcticdb {
52 changes: 33 additions & 19 deletions cpp/arcticdb/codec/codec.cpp
@@ -310,11 +310,12 @@ void decode_string_pool( const arcticdb::proto::encoding::SegmentHeader& hdr,
}
}

void decode_v2(const Segment& segment,
arcticdb::proto::encoding::SegmentHeader& hdr,
SegmentInMemory& res,
const StreamDescriptor& desc)
{
void decode_v2(
const Segment& segment,
arcticdb::proto::encoding::SegmentHeader& hdr,
SegmentInMemory& res,
const StreamDescriptor& desc
) {
ARCTICDB_SAMPLE(DecodeSegment, 0)
const auto [begin, end] = get_segment_begin_end(segment, hdr);
auto encoded_fields_ptr = end;
@@ -364,11 +365,12 @@ void decode_v2(const Segment& segment,
res.set_compacted(segment.header().compacted());
}}

void decode_v1(const Segment& segment,
const arcticdb::proto::encoding::SegmentHeader& hdr,
SegmentInMemory& res,
StreamDescriptor::Proto& desc)
{
void decode_v1(
const Segment& segment,
const arcticdb::proto::encoding::SegmentHeader& hdr,
SegmentInMemory& res,
StreamDescriptor::Proto& desc
) {
ARCTICDB_SAMPLE(DecodeSegment, 0)
const uint8_t* data = segment.buffer().data();
util::check(data != nullptr, "Got null data ptr from segment");
@@ -384,17 +386,29 @@ void decode_v1(const Segment& segment,
const auto seg_row_count = fields_size ? ssize_t(hdr.fields(0).ndarray().items_count()) : 0LL;
res.init_column_map();

for (std::size_t i = 0; i < static_cast<size_t>(fields_size); ++i) {
const auto& field = hdr.fields(static_cast<int>(i));
const auto& field_name = desc.fields(static_cast<int>(i)).name();
util::check(data!=end, "Reached end of input block with {} fields to decode", fields_size-i);
if(auto col_index = res.column_index(field_name)) {
for (int i = 0; i < fields_size; ++i) {
const auto& field = hdr.fields(i);
const auto& field_name = desc.fields(i).name();
if (auto col_index = res.column_index(field_name)) {
auto& col = res.column(static_cast<position_t>(*col_index));
data += decode_field(res.field(*col_index).type(), field, data, col, col.opt_sparse_map(), to_encoding_version(hdr.encoding_version()));
} else
util::check(
data != end || is_empty_type(col.type().data_type()),
"Reached end of input block with {} fields to decode",
fields_size - i
);
data += decode_field(
res.field(*col_index).type(),
field,
data,
col,
col.opt_sparse_map(),
to_encoding_version(hdr.encoding_version())
);
} else {
util::check(data != end, "Reached end of input block with {} fields to decode", fields_size - i);
data += encoding_sizes::field_compressed_size(field);

ARCTICDB_TRACE(log::codec(), "Decoded column {} to position {}", i, data-begin);
}
ARCTICDB_TRACE(log::codec(), "Decoded column {} to position {}", i, data - begin);
}

decode_string_pool(hdr, data, begin, end, res);
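
The decode_v1 loop above now tolerates the read cursor sitting exactly at the end of the input block when the pending column is of the empty type, since empty-typed columns encode no data bytes. A minimal standalone sketch of that guard, under the assumption that per-field metadata is available (FieldInfo and decode_one are illustrative names, not the ArcticDB API):

#include <cstddef>
#include <cstdint>
#include <stdexcept>

// Hypothetical stand-in for the per-field metadata the decoder works with.
struct FieldInfo {
    bool is_empty_type;         // the placeholder "empty" type stores no data
    std::size_t encoded_bytes;  // bytes this field occupies in the block
};

// Advance the read cursor past one field. Running out of input is only an
// error when the field actually needs bytes, mirroring the relaxed check.
const std::uint8_t* decode_one(const std::uint8_t* data, const std::uint8_t* end, const FieldInfo& field) {
    if (data == end && !field.is_empty_type)
        throw std::runtime_error("Reached end of input block with fields left to decode");
    return data + field.encoded_bytes;  // empty-typed fields contribute zero bytes
}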
35 changes: 19 additions & 16 deletions cpp/arcticdb/codec/zstd.hpp
@@ -51,28 +51,31 @@ struct ZstdBlockEncoder {

struct ZstdDecoder {

/*
* encoder_version is here to support multiple versions but won't be used before we have them
*/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"

template<typename T>
/// @param[in] encoder_version Used to support multiple versions but won't be used before we have them
template <typename T>
static void decode_block(
std::uint32_t encoder_version,
const std::uint8_t *in,
[[maybe_unused]] std::uint32_t encoder_version,
const std::uint8_t* in,
std::size_t in_bytes,
T *t_out,
std::size_t out_bytes) {
T* t_out,
std::size_t out_bytes
) {

const std::size_t decomp_size = ZSTD_getFrameContentSize(in, in_bytes);
codec::check<ErrorCode::E_DECODE_ERROR>(decomp_size == out_bytes, "expected out_bytes == zstd deduced bytes, actual {} != {}",
out_bytes, decomp_size);
codec::check<ErrorCode::E_DECODE_ERROR>(
decomp_size == out_bytes,
"expected out_bytes == zstd deduced bytes, actual {} != {}",
out_bytes,
decomp_size
);
std::size_t real_decomp = ZSTD_decompress(t_out, out_bytes, in, in_bytes);
codec::check<ErrorCode::E_DECODE_ERROR>(real_decomp == out_bytes, "expected out_bytes == zstd decompressed bytes, actual {} != {}",
out_bytes, real_decomp);
codec::check<ErrorCode::E_DECODE_ERROR>(
real_decomp == out_bytes,
"expected out_bytes == zstd decompressed bytes, actual {} != {}",
out_bytes,
real_decomp
);
}
#pragma GCC diagnostic pop
};

} // namespace arcticdb::detail
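
The two checks in ZstdDecoder::decode_block rely on zstd frames recording their decompressed size in the frame header, so a mismatch is caught both before decompressing (from the header) and after (from the actual byte count). A self-contained sketch of the same pattern against the public zstd API (decompress_exact is an illustrative name, not part of this codebase):

#include <zstd.h>

#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

std::vector<std::uint8_t> decompress_exact(const std::uint8_t* in, std::size_t in_bytes, std::size_t expected_bytes) {
    // The frame header carries the original content size; reject a mismatch
    // (or an unknown/invalid size) before doing any work.
    const unsigned long long frame_size = ZSTD_getFrameContentSize(in, in_bytes);
    if (frame_size != expected_bytes)
        throw std::runtime_error("zstd frame size does not match expected output size");
    std::vector<std::uint8_t> out(expected_bytes);
    const std::size_t written = ZSTD_decompress(out.data(), out.size(), in, in_bytes);
    // Verify both that decompression succeeded and that it filled the buffer exactly.
    if (ZSTD_isError(written) || written != expected_bytes)
        throw std::runtime_error("zstd decompressed byte count does not match expected output size");
    return out;
}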
149 changes: 88 additions & 61 deletions cpp/arcticdb/column_store/column.cpp
@@ -90,14 +90,14 @@ void Column::unsparsify(size_t num_rows) {
if(!sparse_map_)
return;

type_.visit_tag([that=this, num_rows] (const auto tdt) {
type_.visit_tag([this, num_rows] (const auto tdt) {
using TagType = decltype(tdt);
using RawType = typename TagType::DataTypeTag::raw_type;
const auto dest_bytes = num_rows * sizeof(RawType);
auto dest = ChunkedBuffer::presized(dest_bytes);
util::default_initialize<TagType>(dest.data(), dest_bytes);
util::expand_dense_buffer_using_bitmap<RawType>(that->sparse_map_.value(), that->data_.buffer().data(), dest.data());
std::swap(dest, that->data_.buffer());
util::expand_dense_buffer_using_bitmap<RawType>(sparse_map_.value(), data_.buffer().data(), dest.data());
std::swap(dest, data_.buffer());
});
sparse_map_ = std::nullopt;
last_logical_row_ = last_physical_row_ = static_cast<ssize_t>(num_rows) - 1;
@@ -106,14 +106,13 @@
}

void Column::sparsify() {
type().visit_tag([that=this](auto type_desc_tag) {
using TDT = decltype(type_desc_tag);
using RawType = typename TDT::DataTypeTag::raw_type;
if constexpr(is_floating_point_type(TDT::DataTypeTag::data_type)) {
auto raw_ptr =reinterpret_cast<const RawType*>(that->ptr());
auto buffer = util::scan_floating_point_to_sparse(raw_ptr, that->row_count(), that->sparse_map());
std::swap(that->data().buffer(), buffer);
that->last_physical_row_ = that->sparse_map().count() - 1;
type().visit_tag([this](auto type_desc_tag) {
using RawType = typename decltype(type_desc_tag)::DataTypeTag::raw_type;
if constexpr (is_floating_point_type(type_desc_tag.data_type())) {
auto raw_ptr = reinterpret_cast<const RawType*>(ptr());
auto buffer = util::scan_floating_point_to_sparse(raw_ptr, row_count(), sparse_map());
std::swap(data().buffer(), buffer);
last_physical_row_ = sparse_map().count() - 1;
}
});
}
@@ -513,65 +512,89 @@ std::vector<std::shared_ptr<Column>> Column::split(const std::shared_ptr<Column>
return output;
}

// Produces a new column
// Inclusive of start_row, exclusive of end_row
std::shared_ptr<Column> Column::truncate(const std::shared_ptr<Column>& column, size_t start_row, size_t end_row) {
auto type_size = get_type_size(column->type().data_type());
// Bytes from the underlying chunked buffer to include. Inclusive of start_byte, exclusive of end_byte
size_t start_byte;
size_t end_byte;
util::BitMagic input_sparse_map;
if (column->is_sparse()) {
input_sparse_map = column->sparse_map();
internal::check<ErrorCode::E_ASSERTION_FAILURE>(input_sparse_map.size() > 0, "Unexpected empty sparse map in Column::truncate");
// Sparse columns do not include trailing 0s in the bitset, so the relevant end_row is capped at the size of the bitset
/// Bytes from the underlying chunked buffer to include when truncating. Inclusive of start_byte, exclusive of end_byte
[[nodiscard]] static std::pair<size_t, size_t> column_start_end_bytes(
const Column& column,
size_t start_row,
size_t end_row
) {
const size_t type_size = get_type_size(column.type().data_type());
size_t start_byte = start_row * type_size;
size_t end_byte = end_row * type_size;
if (column.is_sparse()) {
const util::BitMagic& input_sparse_map = column.sparse_map();
internal::check<ErrorCode::E_ASSERTION_FAILURE>(
input_sparse_map.size() > 0,
"Unexpected empty sparse map in Column::truncate"
);
// Sparse columns do not include trailing 0s in the bitset, so the relevant end_row is capped at the size of the
// bitset
end_row = std::min(end_row, static_cast<size_t>(input_sparse_map.size()));
// count_range is inclusive at both ends
auto set_bits_before_range = start_row > 0 ? input_sparse_map.count_range(0, start_row - 1) : 0;
auto set_bits_in_range = input_sparse_map.count_range(start_row, end_row - 1);
const size_t set_bits_before_range = start_row > 0 ? input_sparse_map.count_range(0, start_row - 1) : 0;
const size_t set_bits_in_range = input_sparse_map.count_range(start_row, end_row - 1);
start_byte = set_bits_before_range * type_size;
end_byte = start_byte + (set_bits_in_range * type_size);
} else {
start_byte = start_row * type_size;
end_byte = end_row * type_size;
}
internal::check<ErrorCode::E_ASSERTION_FAILURE>(
start_byte % type_size == 0,
"start_byte {} is not a multiple of type size {}", start_byte, column->type());
start_byte % type_size == 0,
"start_byte {} is not a multiple of type size {}",
start_byte,
column.type()
);
internal::check<ErrorCode::E_ASSERTION_FAILURE>(
end_byte % type_size == 0,
"start_byte {} is not a multiple of type size {}", end_byte, column->type());
end_byte % type_size == 0,
"start_byte {} is not a multiple of type size {}",
end_byte,
column.type()
);
return {start_byte, end_byte};
}

[[nodiscard]] static util::BitMagic truncate_sparse_map(
const util::BitMagic& input_sparse_map,
size_t start_row,
size_t end_row
) {
// The output sparse map is the slice [start_row, end_row) of the input sparse map
// BitMagic doesn't have a method for this, so hand-roll it here
// Ctor parameter is the size
util::BitMagic output_sparse_map(end_row - start_row);
util::BitSet::bulk_insert_iterator inserter(output_sparse_map);
util::BitSetSizeType set_input_bit;
if (start_row == 0) {
// get_first can return 0 if no bits are set, but we checked earlier that input_sparse_map.size() > 0,
// and we do not have sparse maps with no bits set (except for the empty type)
set_input_bit = input_sparse_map.get_first();
if (set_input_bit < end_row) {
inserter = set_input_bit;
}
} else {
set_input_bit = input_sparse_map.get_next(start_row - 1);
// get_next returns 0 if no more bits are set
if (set_input_bit != 0 && set_input_bit < end_row) {
// Shift start_row elements to the left
inserter = set_input_bit - start_row;
}
}
do {
set_input_bit = input_sparse_map.get_next(set_input_bit);
if (set_input_bit != 0 && set_input_bit < end_row) {
inserter = set_input_bit - start_row;
}
} while (set_input_bit != 0);
inserter.flush();
return output_sparse_map;
}

std::shared_ptr<Column> Column::truncate(const std::shared_ptr<Column>& column, size_t start_row, size_t end_row) {
const auto [start_byte, end_byte] = column_start_end_bytes(*column, start_row, end_row);
auto buffer = ::arcticdb::truncate(column->data_.buffer(), start_byte, end_byte);
auto res = std::make_shared<Column>(column->type(), column->allow_sparse_, std::move(buffer));
if (column->is_sparse()) {
// The output sparse map is the slice [start_row, end_row) of the input sparse map
// BitMagic doesn't have a method for this, so hand-roll it here
// Ctor parameter is the size
util::BitMagic output_sparse_map(end_row - start_row);
util::BitSet::bulk_insert_iterator inserter(output_sparse_map);
util::BitSetSizeType set_input_bit;
if (start_row == 0) {
// get_first can return 0 if no bits are set, but we checked earlier that input_sparse_map.size() > 0,
// and we do not have sparse maps with no bits set
set_input_bit = input_sparse_map.get_first();
if (set_input_bit < end_row) {
inserter = set_input_bit;
}
} else {
set_input_bit = input_sparse_map.get_next(start_row - 1);
// get_next returns 0 if no more bits are set
if (set_input_bit != 0 && set_input_bit < end_row) {
// Shift start_row elements to the left
inserter = set_input_bit - start_row;
}
}
do {
set_input_bit = input_sparse_map.get_next(set_input_bit);
if (set_input_bit != 0 && set_input_bit < end_row) {
inserter = set_input_bit - start_row;
}
} while (set_input_bit != 0);
inserter.flush();
util::BitMagic output_sparse_map = is_empty_type(column->type().data_type())
? util::BitMagic{}
: truncate_sparse_map(column->sparse_map(), start_row, end_row);
res->set_sparse_map(std::move(output_sparse_map));
}
res->set_row_data(end_row - (start_row + 1));
Expand Down Expand Up @@ -675,7 +698,7 @@ void Column::regenerate_offsets() const {
}
}

[[nodiscard]] util::BitMagic& Column::sparse_map() {
util::BitMagic& Column::sparse_map() {
if(!sparse_map_)
sparse_map_ = std::make_optional<util::BitMagic>(0);

Expand All @@ -687,4 +710,8 @@ const util::BitMagic& Column::sparse_map() const {
return sparse_map_.value();
}

std::optional<util::BitMagic>& Column::opt_sparse_map() {
return sparse_map_;
}

} //namespace arcticdb
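
To see why column_start_end_bytes converts popcounts rather than row indices into byte offsets for sparse columns: only the set rows are physically stored, so the physical position of a row is the number of set bits before it. A worked example using BitMagic directly (assuming util::BitMagic wraps bm::bvector<>; this example is not part of the PR):

#include <bm.h>  // BitMagic

#include <cassert>
#include <cstddef>

int main() {
    bm::bvector<> sparse_map;  // rows 1, 4 and 7 are present -> physical slots 0, 1, 2
    sparse_map.set(1);
    sparse_map.set(4);
    sparse_map.set(7);

    const std::size_t type_size = 8;               // e.g. an int64 column
    const std::size_t start_row = 2, end_row = 8;  // truncate to rows [2, 8)

    // count_range is inclusive at both ends, hence the -1 adjustments.
    const std::size_t before = sparse_map.count_range(0, start_row - 1);          // 1 set bit (row 1)
    const std::size_t in_range = sparse_map.count_range(start_row, end_row - 1);  // 2 set bits (rows 4, 7)

    const std::size_t start_byte = before * type_size;               // byte 8
    const std::size_t end_byte = start_byte + in_range * type_size;  // byte 24
    assert(start_byte == 8 && end_byte == 24);
    return 0;
}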