From 93a31e12431fb82b997137fd99270a9844f9b9d6 Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Wed, 8 May 2024 23:46:08 +0800 Subject: [PATCH 01/27] done Signed-off-by: Lloyd-Pottiger --- dbms/CMakeLists.txt | 3 +- .../Compression/CompressionCodecDeltaFOR.cpp | 161 +------- .../IO/Compression/CompressionCodecFOR.cpp | 127 +----- dbms/src/IO/Compression/CompressionCodecFOR.h | 6 - .../CompressionCodecIntegerLightweight.cpp | 367 ++++++++++++++++++ .../CompressionCodecIntegerLightweight.h | 111 ++++++ .../IO/Compression/CompressionCodecRLE.cpp | 57 +-- dbms/src/IO/Compression/CompressionFactory.h | 3 + dbms/src/IO/Compression/CompressionInfo.h | 1 + dbms/src/IO/Compression/CompressionMethod.h | 1 + dbms/src/IO/Compression/CompressionSettings.h | 2 + dbms/src/IO/Compression/EncodingUtil.cpp | 253 ++++++++++++ dbms/src/IO/Compression/EncodingUtil.h | 251 ++++++++++++ .../tests/gtest_codec_compression.cpp | 1 + 14 files changed, 1022 insertions(+), 322 deletions(-) create mode 100644 dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp create mode 100644 dbms/src/IO/Compression/CompressionCodecIntegerLightweight.h create mode 100644 dbms/src/IO/Compression/EncodingUtil.cpp create mode 100644 dbms/src/IO/Compression/EncodingUtil.h diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index 44da3f050bf..a7a60040260 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -104,8 +104,7 @@ check_then_add_sources_compile_flag ( src/Columns/ColumnVector.cpp src/DataTypes/DataTypeString.cpp src/Interpreters/Join.cpp - src/IO/Compression/CompressionCodecFOR.cpp - src/IO/Compression/CompressionCodecDeltaFOR.cpp + src/IO/Compression/EncodingUtil.cpp src/Storages/DeltaMerge/BitmapFilter/BitmapFilter.cpp src/Storages/DeltaMerge/DMVersionFilterBlockInputStream.cpp ) diff --git a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp index f449f71e67f..d2bcbbe3262 100644 --- a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp +++ b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp @@ -17,14 +17,11 @@ #include #include #include +#include #include #include -#if defined(__AVX2__) -#include -#endif - namespace DB { @@ -56,148 +53,16 @@ UInt32 CompressionCodecDeltaFOR::getMaxCompressedDataSize(UInt32 uncompressed_si namespace { -template -void DeltaEncode(const T * source, UInt32 count, T * dest) -{ - T prev = 0; - for (UInt32 i = 0; i < count; ++i) - { - T curr = source[i]; - dest[i] = curr - prev; - prev = curr; - } -} - template UInt32 compressData(const char * source, UInt32 source_size, char * dest) { const auto count = source_size / sizeof(T); - DeltaEncode(reinterpret_cast(source), count, reinterpret_cast(dest)); + DB::Compression::DeltaEncoding(reinterpret_cast(source), count, reinterpret_cast(dest)); // Cast deltas to signed type to better compress negative values. + // For example, if we have a sequence of UInt8 values [3, 2, 1, 0], the deltas will be [3, -1, -1, -1] + // If we compress them as UInt8, we will get [3, 255, 255, 255], which is not optimal. using TS = typename std::make_signed::type; - return CompressionCodecFOR::compressData(reinterpret_cast(dest), count, dest); -} - -template -void ordinaryDeltaDecode(const char * source, UInt32 source_size, char * dest) -{ - T accumulator{}; - const char * const source_end = source + source_size; - while (source < source_end) - { - accumulator += unalignedLoad(source); - unalignedStore(dest, accumulator); - - source += sizeof(T); - dest += sizeof(T); - } -} - -template -void DeltaDecode(const char * source, UInt32 source_size, char * dest) -{ - ordinaryDeltaDecode(source, source_size, dest); -} - -#if defined(__AVX2__) -// Note: using SIMD to rewrite compress does not improve performance. - -template <> -void DeltaDecode(const char * __restrict__ raw_source, UInt32 raw_source_size, char * __restrict__ raw_dest) -{ - const auto * source = reinterpret_cast(raw_source); - auto source_size = raw_source_size / sizeof(UInt32); - auto * dest = reinterpret_cast(raw_dest); - __m128i prev = _mm_setzero_si128(); - size_t i = 0; - for (; i < source_size / 4; i++) - { - auto curr = _mm_lddqu_si128(reinterpret_cast(source) + i); - const auto tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); - const auto tmp2 = _mm_add_epi32(_mm_slli_si128(tmp1, 4), tmp1); - prev = _mm_add_epi32(tmp2, _mm_shuffle_epi32(prev, 0xff)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + i, prev); - } - uint32_t lastprev = _mm_extract_epi32(prev, 3); - for (i = 4 * i; i < source_size; ++i) - { - lastprev = lastprev + source[i]; - dest[i] = lastprev; - } -} - -template <> -void DeltaDecode(const char * __restrict__ raw_source, UInt32 raw_source_size, char * __restrict__ raw_dest) -{ - const auto * source = reinterpret_cast(raw_source); - auto source_size = raw_source_size / sizeof(UInt64); - auto * dest = reinterpret_cast(raw_dest); - // AVX2 does not support shffule across 128-bit lanes, so we need to use permute. - __m256i prev = _mm256_setzero_si256(); - __m256i zero = _mm256_setzero_si256(); - size_t i = 0; - for (; i < source_size / 4; ++i) - { - // curr = {a0, a1, a2, a3} - auto curr = _mm256_loadu_si256(reinterpret_cast(source) + i); - // x0 = {0, a0, a1, a2} - auto x0 = _mm256_blend_epi32(_mm256_permute4x64_epi64(curr, 0b10010011), zero, 0b00000011); - // x1 = {a0, a01, a12, a23} - auto x1 = _mm256_add_epi64(curr, x0); - // x2 = {0, 0, a0, a01} - auto x2 = _mm256_permute2f128_si256(x1, x1, 0b00101000); - // prev = prev + {a0, a01, a012, a0123} - prev = _mm256_add_epi64(prev, _mm256_add_epi64(x1, x2)); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(dest) + i, prev); - // prev = {prev[3], prev[3], prev[3], prev[3]} - prev = _mm256_permute4x64_epi64(prev, 0b11111111); - } - UInt64 lastprev = _mm256_extract_epi64(prev, 3); - for (i = 4 * i; i < source_size; ++i) - { - lastprev += source[i]; - dest[i] = lastprev; - } -} - -#endif - -template -void ordinaryDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 output_size) -{ - using TS = typename std::make_signed::type; - CompressionCodecFOR::decompressData(source, source_size, dest, output_size); - ordinaryDeltaDecode(dest, output_size, dest); -} - -template -void decompressData(const char * source, UInt32 source_size, char * dest, UInt32 output_size) -{ - ordinaryDecompressData(source, source_size, dest, output_size); -} - -template <> -void decompressData(const char * source, UInt32 source_size, char * dest, UInt32 output_size) -{ - const auto count = output_size / sizeof(UInt32); - auto round_size = BitpackingPrimitives::roundUpToAlgorithmGroupSize(count); - // Reserve enough space for the temporary buffer. - const auto required_size = round_size * sizeof(UInt32); - char tmp_buffer[required_size]; - CompressionCodecFOR::decompressData(source, source_size, tmp_buffer, required_size); - DeltaDecode(reinterpret_cast(tmp_buffer), output_size, dest); -} - -template <> -void decompressData(const char * source, UInt32 source_size, char * dest, UInt32 output_size) -{ - const auto count = output_size / sizeof(UInt64); - const auto round_size = BitpackingPrimitives::roundUpToAlgorithmGroupSize(count); - // Reserve enough space for the temporary buffer. - const auto required_size = round_size * sizeof(UInt64); - char tmp_buffer[required_size]; - CompressionCodecFOR::decompressData(source, source_size, tmp_buffer, required_size); - DeltaDecode(reinterpret_cast(tmp_buffer), output_size, dest); + return DB::CompressionCodecFOR::compressData(reinterpret_cast(dest), count, dest); } } // namespace @@ -249,16 +114,16 @@ void CompressionCodecDeltaFOR::doDecompressData( switch (bytes_size) { case 1: - decompressData(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::DeltaForDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; case 2: - decompressData(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::DeltaForDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; case 4: - decompressData(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::DeltaForDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; case 8: - decompressData(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::DeltaForDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; default: throw Exception( @@ -293,16 +158,16 @@ void CompressionCodecDeltaFOR::ordinaryDecompress( switch (bytes_size) { case 1: - ordinaryDecompressData(&source[1], source_size_no_header, dest, dest_size); + DB::Compression::OrdinaryDeltaForDecoding(&source[1], source_size_no_header, dest, dest_size); break; case 2: - ordinaryDecompressData(&source[1], source_size_no_header, dest, dest_size); + DB::Compression::OrdinaryDeltaForDecoding(&source[1], source_size_no_header, dest, dest_size); break; case 4: - ordinaryDecompressData(&source[1], source_size_no_header, dest, dest_size); + DB::Compression::OrdinaryDeltaForDecoding(&source[1], source_size_no_header, dest, dest_size); break; case 8: - ordinaryDecompressData(&source[1], source_size_no_header, dest, dest_size); + DB::Compression::OrdinaryDeltaForDecoding(&source[1], source_size_no_header, dest, dest_size); break; default: throw Exception( diff --git a/dbms/src/IO/Compression/CompressionCodecFOR.cpp b/dbms/src/IO/Compression/CompressionCodecFOR.cpp index af5e46c99c2..86b21719744 100644 --- a/dbms/src/IO/Compression/CompressionCodecFOR.cpp +++ b/dbms/src/IO/Compression/CompressionCodecFOR.cpp @@ -16,12 +16,10 @@ #include #include #include +#include #include #include -#if defined(__AVX2__) -#include -#endif namespace DB { @@ -58,98 +56,8 @@ UInt32 CompressionCodecFOR::compressData(const T * source, UInt32 count, char * std::vector values(count); values.assign(source, source + count); T frame_of_reference = *std::min_element(values.cbegin(), values.cend()); - // store frame of reference - unalignedStore(dest, frame_of_reference); - dest += sizeof(T); - if (frame_of_reference != 0) - { - for (auto & value : values) - value -= frame_of_reference; - } - T max_value = *std::max_element(values.cbegin(), values.cend()); - UInt8 width = BitpackingPrimitives::minimumBitWidth(max_value); - // store width - unalignedStore(dest, width); - dest += sizeof(UInt8); - // if width == 0, skip bitpacking - if (width == 0) - return sizeof(T) + sizeof(UInt8); - auto required_size = BitpackingPrimitives::getRequiredSize(count, width); - // after applying frame of reference, all values are bigger than 0. - BitpackingPrimitives::packBuffer(reinterpret_cast(dest), values.data(), count, width); - return sizeof(T) + sizeof(UInt8) + required_size; -} - -template -void CompressionCodecFOR::decompressData(const char * source, UInt32 source_size, char * dest, UInt32 output_size) -{ - const auto count = output_size / sizeof(T); - T frame_of_reference = unalignedLoad(source); - source += sizeof(T); - auto width = unalignedLoad(source); - source += sizeof(UInt8); - const auto required_size = source_size - sizeof(T) - sizeof(UInt8); - RUNTIME_CHECK(BitpackingPrimitives::getRequiredSize(count, width) == required_size); - auto round_size = BitpackingPrimitives::roundUpToAlgorithmGroupSize(count); - if (round_size != count) - { - // Reserve enough space for the temporary buffer. - unsigned char tmp_buffer[round_size * sizeof(T)]; - BitpackingPrimitives::unPackBuffer( - tmp_buffer, - reinterpret_cast(source), - count, - width); - CompressionCodecFOR::applyFrameOfReference(reinterpret_cast(tmp_buffer), frame_of_reference, count); - memcpy(dest, tmp_buffer, output_size); - return; - } - BitpackingPrimitives::unPackBuffer( - reinterpret_cast(dest), - reinterpret_cast(source), - count, - width); - CompressionCodecFOR::applyFrameOfReference(reinterpret_cast(dest), frame_of_reference, count); -} - -template -void CompressionCodecFOR::applyFrameOfReference(T * dst, T frame_of_reference, UInt32 count) -{ - if (frame_of_reference == 0) - return; - - UInt32 i = 0; -#if defined(__AVX2__) - UInt32 aligned_count = count - count % (sizeof(__m256i) / sizeof(T)); - for (; i < aligned_count; i += (sizeof(__m256i) / sizeof(T))) - { - // Load the data using SIMD - __m256i value = _mm256_loadu_si256(reinterpret_cast<__m256i *>(dst + i)); - // Perform vectorized addition - if constexpr (sizeof(T) == 1) - { - value = _mm256_add_epi8(value, _mm256_set1_epi8(frame_of_reference)); - } - else if constexpr (sizeof(T) == 2) - { - value = _mm256_add_epi16(value, _mm256_set1_epi16(frame_of_reference)); - } - else if constexpr (sizeof(T) == 4) - { - value = _mm256_add_epi32(value, _mm256_set1_epi32(frame_of_reference)); - } - else if constexpr (sizeof(T) == 8) - { - value = _mm256_add_epi64(value, _mm256_set1_epi64x(frame_of_reference)); - } - // Store the result back to memory - _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + i), value); - } -#endif - for (; i < count; ++i) - { - dst[i] += frame_of_reference; - } + UInt8 width = DB::Compression::ForEncodingWidth(values, frame_of_reference); + return DB::Compression::ForEncoding>(values, frame_of_reference, width, dest); } UInt32 CompressionCodecFOR::doCompressData(const char * source, UInt32 source_size, char * dest) const @@ -200,16 +108,16 @@ void CompressionCodecFOR::doDecompressData( switch (bytes_size) { case 1: - decompressData(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::ForDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; case 2: - decompressData(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::ForDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; case 4: - decompressData(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::ForDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; case 8: - decompressData(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::ForDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; default: throw Exception( @@ -227,25 +135,4 @@ template UInt32 CompressionCodecFOR::compressData(const Int16 * source, U template UInt32 CompressionCodecFOR::compressData(const Int32 * source, UInt32 count, char * dest); template UInt32 CompressionCodecFOR::compressData(const Int64 * source, UInt32 count, char * dest); -template void CompressionCodecFOR::decompressData( - const char * source, - UInt32 source_size, - char * dest, - UInt32 output_size); -template void CompressionCodecFOR::decompressData( - const char * source, - UInt32 source_size, - char * dest, - UInt32 output_size); -template void CompressionCodecFOR::decompressData( - const char * source, - UInt32 source_size, - char * dest, - UInt32 output_size); -template void CompressionCodecFOR::decompressData( - const char * source, - UInt32 source_size, - char * dest, - UInt32 output_size); - } // namespace DB diff --git a/dbms/src/IO/Compression/CompressionCodecFOR.h b/dbms/src/IO/Compression/CompressionCodecFOR.h index 38798b3d8d2..75dd8b91734 100644 --- a/dbms/src/IO/Compression/CompressionCodecFOR.h +++ b/dbms/src/IO/Compression/CompressionCodecFOR.h @@ -34,15 +34,9 @@ class CompressionCodecFOR : public ICompressionCodec UInt8 getMethodByte() const override; - template - static void applyFrameOfReference(T * dst, T frame_of_reference, UInt32 count); - template static UInt32 compressData(const T * source, UInt32 count, char * dest); - template - static void decompressData(const char * source, UInt32 source_size, char * dest, UInt32 output_size); - #ifndef DBMS_PUBLIC_GTEST protected: #endif diff --git a/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp b/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp new file mode 100644 index 00000000000..9dd9445a53a --- /dev/null +++ b/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp @@ -0,0 +1,367 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +namespace DB +{ + +// TODO: metrics + +namespace ErrorCodes +{ +extern const int CANNOT_COMPRESS; +extern const int CANNOT_DECOMPRESS; +} // namespace ErrorCodes + +CompressionCodecIntegerLightweight::CompressionCodecIntegerLightweight(UInt8 bytes_size_) + : bytes_size(bytes_size_) +{} + +UInt8 CompressionCodecIntegerLightweight::getMethodByte() const +{ + return static_cast(CompressionMethodByte::Lightweight); +} + +UInt32 CompressionCodecIntegerLightweight::getMaxCompressedDataSize(UInt32 uncompressed_size) const +{ + // 1 byte for bytes_size, 1 byte for mode, and the rest for compressed data + return 1 + 1 + uncompressed_size; +} + +template +size_t CompressionCodecIntegerLightweight::compressDataForType(const char * source, UInt32 source_size, char * dest) + const +{ + if (source_size % sizeof(T) != 0) + throw Exception( + ErrorCodes::CANNOT_COMPRESS, + "Cannot compress with lightweight codec, data size {} is not aligned to {}", + source_size, + sizeof(T)); + + // Load values + const size_t count = source_size / sizeof(T); + std::vector values(count); + for (size_t i = 0; i < count; ++i) + { + values[i] = unalignedLoad(source + i * sizeof(T)); + } + + // Analyze + State state; + ctx.analyze(values, state); + + // Compress + unalignedStore(dest, static_cast(ctx.mode)); + dest += sizeof(UInt8); + size_t compressed_size = 1; + switch (ctx.mode) + { + case Mode::CONSTANT: + { + compressed_size += Compression::ConstantEncoding(std::get<0>(state), dest); + break; + } + case Mode::CONSTANT_DELTA: + { + compressed_size += Compression::ConstantDeltaEncoding(values[0], std::get<0>(state), dest); + break; + } + case Mode::RLE: + { + compressed_size += Compression::RLEEncoding(std::get<1>(state), dest); + break; + } + case Mode::FOR: + { + FORState for_state = std::get<2>(state); + compressed_size += Compression::ForEncoding(values, for_state.min_value, for_state.bit_width, dest); + break; + } + case Mode::DELTA_FOR: + { + DeltaFORState delta_for_state = std::get<3>(state); + compressed_size += Compression::ForEncoding, true>( + delta_for_state.deltas, + delta_for_state.min_delta_value, + delta_for_state.bit_width, + dest); + break; + } + case Mode::LZ4: + { + auto success = LZ4_compress_fast( + source, + dest, + source_size, + LZ4_COMPRESSBOUND(source_size), + CompressionSetting::getDefaultLevel(CompressionMethod::LZ4)); + if (!success) + throw Exception("Cannot LZ4_compress_fast", ErrorCodes::CANNOT_COMPRESS); + compressed_size += success; + break; + } + default: + throw Exception( + ErrorCodes::CANNOT_COMPRESS, + "Cannot compress with lightweight codec, unknown mode {}", + static_cast(ctx.mode)); + } + + // Update statistics + ctx.update(source_size, compressed_size); + + return compressed_size; +} + +template +void CompressionCodecIntegerLightweight::decompressDataForType( + const char * source, + UInt32 source_size, + char * dest, + UInt32 output_size) const +{ + auto mode = static_cast(unalignedLoad(source)); + source += sizeof(UInt8); + source_size -= sizeof(UInt8); + switch (mode) + { + case Mode::CONSTANT: + Compression::ConstantDecoding(source, source_size, dest, output_size); + break; + case Mode::CONSTANT_DELTA: + Compression::ConstantDeltaDecoding(source, source_size, dest, output_size); + break; + case Mode::RLE: + Compression::RLEDecoding(source, source_size, dest, output_size); + break; + case Mode::FOR: + Compression::ForDecoding(source, source_size, dest, output_size); + break; + case Mode::DELTA_FOR: + Compression::DeltaForDecoding(source, source_size, dest, output_size); + break; + case Mode::LZ4: + if (unlikely(LZ4_decompress_safe(source, dest, source_size, output_size) < 0)) + throw Exception("Cannot LZ4_decompress_safe", ErrorCodes::CANNOT_DECOMPRESS); + break; + default: + throw Exception( + ErrorCodes::CANNOT_DECOMPRESS, + "Cannot decompress with lightweight codec, unknown mode {}", + static_cast(mode)); + } +} + +void CompressionCodecIntegerLightweight::CompressContext::update(size_t uncompressed_size, size_t compressed_size) +{ + if (mode == Mode::LZ4) + { + lz4_uncompressed_size += uncompressed_size; + lz4_compressed_size += compressed_size; + ++lz4_counter; + } + else + { + lw_uncompressed_size += uncompressed_size; + lw_compressed_size += compressed_size; + ++lw_counter; + } +} + +bool CompressionCodecIntegerLightweight::CompressContext::needAnalyze() const +{ + // lightweight codec is never used, do not analyze anymore + if (lz4_counter > 5 && lw_counter == 0) + return false; + // if lz4 is used more than 5 times and the compression ratio is better than lightweight codec, do not analyze anymore + if (lz4_counter > 5 && lz4_uncompressed_size / lz4_compressed_size > lw_compressed_size / lw_uncompressed_size) + return false; + return true; +} + +template +void CompressionCodecIntegerLightweight::CompressContext::analyze(std::vector & values, State & state) +{ + if (!needAnalyze()) + return; + + if (values.empty()) + { + mode = Mode::Invalid; + return; + } + + // Check CONSTANT + std::vector> rle; + rle.reserve(values.size()); + rle.emplace_back(values[0], 1); + for (size_t i = 1; i < values.size(); ++i) + { + if (values[i] != values[i - 1] || rle.back().second == std::numeric_limits::max()) + rle.emplace_back(values[i], 1); + else + ++rle.back().second; + } + T min_value = *std::min_element(values.cbegin(), values.cend()); + T max_value = *std::max_element(values.cbegin(), values.cend()); + if (rle.size() == 1) + { + state = rle[0].first; + mode = Mode::CONSTANT; + return; + } + + // Check CONSTANT_DELTA + using TS = std::make_signed_t; + std::vector deltas; + deltas.reserve(values.size()); + deltas.push_back(values[0]); + for (size_t i = 1; i < values.size(); ++i) + { + deltas.push_back(values[i] - values[i - 1]); + } + TS min_delta = *std::min_element(deltas.cbegin(), deltas.cend()); + TS max_delta = *std::max_element(deltas.cbegin(), deltas.cend()); + if (min_delta == max_delta) + { + state = static_cast(min_delta); + mode = Mode::CONSTANT_DELTA; + return; + } + + UInt8 delta_for_width = Compression::ForEncodingWidth(deltas, min_delta); + // additional T bytes for min_delta, and 1 byte for width + size_t delta_for_size + = BitpackingPrimitives::getRequiredSize(deltas.size(), delta_for_width) + sizeof(T) + sizeof(UInt8); + UInt8 for_width = BitpackingPrimitives::minimumBitWidth(max_value - min_value); + // additional T bytes for min_value, and 1 byte for width + size_t for_size = BitpackingPrimitives::getRequiredSize(values.size(), for_width) + sizeof(T) + sizeof(UInt8); + size_t origin_size = values.size() * sizeof(T); + size_t rle_size = Compression::RLEPairsSize(rle); + if (rle_size < delta_for_size && rle_size < for_size && rle_size < origin_size) + { + state = std::move(rle); + mode = Mode::RLE; + } + else if (for_size < delta_for_size && for_size < origin_size) + { + state = FORState{min_value, for_width}; + mode = Mode::FOR; + } + else if (delta_for_size < origin_size) + { + state = DeltaFORState{deltas, min_delta, delta_for_width}; + mode = Mode::DELTA_FOR; + } + else + { + mode = Mode::LZ4; + } +} + +UInt32 CompressionCodecIntegerLightweight::doCompressData(const char * source, UInt32 source_size, char * dest) const +{ + if unlikely (source_size % bytes_size != 0) + throw Exception( + ErrorCodes::CANNOT_COMPRESS, + "Cannot compress with lightweight codec, data size {} is not aligned to {}", + source_size, + bytes_size); + + dest[0] = bytes_size; + dest += 1; + switch (bytes_size) + { + case 1: + return 1 + compressDataForType(source, source_size, dest); + case 2: + return 1 + compressDataForType(source, source_size, dest); + case 4: + return 1 + compressDataForType(source, source_size, dest); + case 8: + return 1 + compressDataForType(source, source_size, dest); + default: + throw Exception( + ErrorCodes::CANNOT_COMPRESS, + "Cannot compress with lightweight codec, unknown bytes size {}", + bytes_size); + } +} + +void CompressionCodecIntegerLightweight::doDecompressData( + const char * source, + UInt32 source_size, + char * dest, + UInt32 uncompressed_size) const +{ + if unlikely (source_size < 2) + throw Exception( + ErrorCodes::CANNOT_DECOMPRESS, + "Cannot decompress lightweight-encoded data. File has wrong header"); + + if (uncompressed_size == 0) + return; + + UInt8 bytes_size = source[0]; + + if unlikely (bytes_size != 1 && bytes_size != 2 && bytes_size != 4 && bytes_size != 8) + throw Exception( + ErrorCodes::CANNOT_DECOMPRESS, + "Cannot decompress lightweight-encoded data. File has wrong header"); + + if unlikely (uncompressed_size % bytes_size != 0) + throw Exception( + ErrorCodes::CANNOT_DECOMPRESS, + "Cannot decompress lightweight-encoded data. Uncompressed size {} is not aligned to {}", + uncompressed_size, + bytes_size); + + UInt32 source_size_no_header = source_size - 1; + switch (bytes_size) + { + case 1: + decompressDataForType(&source[1], source_size_no_header, dest, uncompressed_size); + break; + case 2: + decompressDataForType(&source[1], source_size_no_header, dest, uncompressed_size); + break; + case 4: + decompressDataForType(&source[1], source_size_no_header, dest, uncompressed_size); + break; + case 8: + decompressDataForType(&source[1], source_size_no_header, dest, uncompressed_size); + break; + default: + throw Exception( + ErrorCodes::CANNOT_DECOMPRESS, + "Cannot compress with lightweight codec, unknown bytes size {}", + bytes_size); + } +} + +} // namespace DB diff --git a/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.h b/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.h new file mode 100644 index 00000000000..bbc7f8a7191 --- /dev/null +++ b/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.h @@ -0,0 +1,111 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace DB +{ + +class CompressionCodecIntegerLightweight : public ICompressionCodec +{ +public: + explicit CompressionCodecIntegerLightweight(UInt8 bytes_size_); + + UInt8 getMethodByte() const override; + +protected: + UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override; + void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) + const override; + + UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override; + + bool isCompression() const override { return true; } // light compression + bool isGenericCompression() const override { return false; } + +private: + enum class Mode : UInt8 + { + Invalid = 0, + CONSTANT = 1, // all values are the same + CONSTANT_DELTA = 2, // the difference between two adjacent values is the same + RLE = 3, // run-length encoding + FOR = 4, // Frame of Reference encoding + DELTA_FOR = 5, // delta encoding and then FOR encoding + LZ4 = 6, // the above modes are not suitable, use LZ4 instead + }; + + // Constant or ConstantDelta + template + using ConstantState = T; + + template + using RLEState = std::vector>; + + template + struct FORState + { + T min_value; + UInt8 bit_width; + }; + + template + struct DeltaFORState + { + using TS = typename std::make_signed_t; + std::vector deltas; + TS min_delta_value; + UInt8 bit_width; + }; + + // State is a union of different states for different modes + template + using State = std::variant, RLEState, FORState, DeltaFORState>; + + class CompressContext + { + public: + CompressContext() = default; + + bool needAnalyze() const; + + template + void analyze(std::vector & values, State & state); + + void update(size_t uncompressed_size, size_t compressed_size); + + Mode mode = Mode::LZ4; + + private: + size_t lw_uncompressed_size = 0; + size_t lw_compressed_size = 0; + size_t lw_counter = 0; + size_t lz4_uncompressed_size = 0; + size_t lz4_compressed_size = 0; + size_t lz4_counter = 0; + }; + + template + size_t compressDataForType(const char * source, UInt32 source_size, char * dest) const; + + template + void decompressDataForType(const char * source, UInt32 source_size, char * dest, UInt32 output_size) const; + + mutable CompressContext ctx; + const UInt8 bytes_size; +}; + +} // namespace DB diff --git a/dbms/src/IO/Compression/CompressionCodecRLE.cpp b/dbms/src/IO/Compression/CompressionCodecRLE.cpp index c16d7535f7d..27ddd53c4a7 100644 --- a/dbms/src/IO/Compression/CompressionCodecRLE.cpp +++ b/dbms/src/IO/Compression/CompressionCodecRLE.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -48,24 +49,23 @@ namespace { constexpr UInt8 JUST_COPY_CODE = 0xFF; -// TODO: better implementation -template +template UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) { const char * source_end = source + source_size; - std::vector> rle_vec; + DB::Compression::RLEPairs rle_vec; rle_vec.reserve(source_size / sizeof(T)); - static constexpr size_t RLE_PAIR_LENGTH = sizeof(T) + sizeof(UInt16); for (const auto * src = source; src < source_end; src += sizeof(T)) { T value = unalignedLoad(src); - if (rle_vec.empty() || rle_vec.back().first != value) + if (rle_vec.empty() || rle_vec.back().first != value + || rle_vec.back().second == std::numeric_limits::max()) rle_vec.emplace_back(value, 1); else ++rle_vec.back().second; } - if (rle_vec.size() * RLE_PAIR_LENGTH > source_size) + if (DB::Compression::RLEPairsSize(rle_vec) > source_size) { dest[0] = JUST_COPY_CODE; memcpy(&dest[1], source, source_size); @@ -74,42 +74,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) dest[0] = sizeof(T); dest += 1; - for (const auto & [value, count] : rle_vec) - { - unalignedStore(dest, value); - dest += sizeof(T); - unalignedStore(dest, count); - dest += sizeof(UInt16); - } - return 1 + rle_vec.size() * RLE_PAIR_LENGTH; -} - -template -void decompressDataForType(const char * source, UInt32 source_size, char * dest, UInt32 output_size) -{ - const char * output_end = dest + output_size; - const char * source_end = source + source_size; - - UInt8 bytes_size = source[0]; - RUNTIME_CHECK(bytes_size == sizeof(T), bytes_size, sizeof(T)); - source += 1; - - while (source < source_end) - { - T data = unalignedLoad(source); - source += sizeof(T); - auto count = unalignedLoad(source); - source += sizeof(UInt16); - if unlikely (dest + count * sizeof(T) > output_end) - throw Exception( - ErrorCodes::CANNOT_DECOMPRESS, - "Cannot decompress RLE-encoded data, output buffer is too small"); - for (UInt16 i = 0; i < count; ++i) - { - unalignedStore(dest, data); - dest += sizeof(T); - } - } + return 1 + DB::Compression::RLEEncoding(rle_vec, dest); } } // namespace @@ -165,16 +130,16 @@ void CompressionCodecRLE::doDecompressData( switch (bytes_size) { case 1: - decompressDataForType(source, source_size, dest, uncompressed_size); + DB::Compression::RLEDecoding(&source[1], source_size - 1, dest, uncompressed_size); break; case 2: - decompressDataForType(source, source_size, dest, uncompressed_size); + DB::Compression::RLEDecoding(&source[1], source_size - 1, dest, uncompressed_size); break; case 4: - decompressDataForType(source, source_size, dest, uncompressed_size); + DB::Compression::RLEDecoding(&source[1], source_size - 1, dest, uncompressed_size); break; case 8: - decompressDataForType(source, source_size, dest, uncompressed_size); + DB::Compression::RLEDecoding(&source[1], source_size - 1, dest, uncompressed_size); break; default: throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress RLE-encoded data. Unsupported bytes size"); diff --git a/dbms/src/IO/Compression/CompressionFactory.h b/dbms/src/IO/Compression/CompressionFactory.h index aac621b42ad..8e1646f5550 100644 --- a/dbms/src/IO/Compression/CompressionFactory.h +++ b/dbms/src/IO/Compression/CompressionFactory.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -49,6 +50,8 @@ class CompressionFactory return std::make_unique(setting.level); case CompressionMethod::ZSTD: return std::make_unique(setting.level); + case CompressionMethod::Lightweight: + return std::make_unique(setting.type_bytes_size); #if USE_QPL case CompressionMethod::QPL: return std::make_unique(); diff --git a/dbms/src/IO/Compression/CompressionInfo.h b/dbms/src/IO/Compression/CompressionInfo.h index 2d2a3d9e190..fc9866635c9 100644 --- a/dbms/src/IO/Compression/CompressionInfo.h +++ b/dbms/src/IO/Compression/CompressionInfo.h @@ -60,6 +60,7 @@ enum class CompressionMethodByte : UInt8 DeltaFOR = 0x92, RLE = 0x93, FOR = 0x94, + Lightweight = 0x95, // COL_END is not a compreesion method, but a flag of column end used in compact file. COL_END = 0x66, }; diff --git a/dbms/src/IO/Compression/CompressionMethod.h b/dbms/src/IO/Compression/CompressionMethod.h index 88f64edb9ed..21c6a3ca007 100644 --- a/dbms/src/IO/Compression/CompressionMethod.h +++ b/dbms/src/IO/Compression/CompressionMethod.h @@ -25,6 +25,7 @@ enum class CompressionMethod ZSTD = 3, /// Experimental algorithm: https://github.com/Cyan4973/zstd QPL = 4, /// The Intel Query Processing Library (QPL) is an open-source library to provide high-performance query processing operations NONE = 5, /// No compression + Lightweight = 6, /// Lightweight compression }; } // namespace DB diff --git a/dbms/src/IO/Compression/CompressionSettings.h b/dbms/src/IO/Compression/CompressionSettings.h index ce9df0ba06f..2f90eee5019 100644 --- a/dbms/src/IO/Compression/CompressionSettings.h +++ b/dbms/src/IO/Compression/CompressionSettings.h @@ -31,6 +31,7 @@ constexpr CompressionMethodByte method_byte_map[] = { CompressionMethodByte::ZSTD, // ZSTD CompressionMethodByte::QPL, // QPL CompressionMethodByte::NONE, // NONE + CompressionMethodByte::Lightweight, // Lightweight }; const std::unordered_map method_map = { @@ -41,6 +42,7 @@ const std::unordered_map method_map = {CompressionMethodByte::DeltaFOR, CompressionMethod::NONE}, {CompressionMethodByte::RLE, CompressionMethod::NONE}, {CompressionMethodByte::FOR, CompressionMethod::NONE}, + {CompressionMethodByte::Lightweight, CompressionMethod::Lightweight}, }; struct CompressionSetting diff --git a/dbms/src/IO/Compression/EncodingUtil.cpp b/dbms/src/IO/Compression/EncodingUtil.cpp new file mode 100644 index 00000000000..331d3144959 --- /dev/null +++ b/dbms/src/IO/Compression/EncodingUtil.cpp @@ -0,0 +1,253 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "EncodingUtil.h" + +#if defined(__AVX2__) +#include +#endif + +namespace DB::Compression +{ + +template +void ApplyFrameOfReference(T * dst, T frame_of_reference, UInt32 count) +{ + if (frame_of_reference == 0) + return; + + UInt32 i = 0; +#if defined(__AVX2__) + UInt32 aligned_count = count - count % (sizeof(__m256i) / sizeof(T)); + for (; i < aligned_count; i += (sizeof(__m256i) / sizeof(T))) + { + // Load the data using SIMD + __m256i value = _mm256_loadu_si256(reinterpret_cast<__m256i *>(dst + i)); + // Perform vectorized addition + if constexpr (sizeof(T) == 1) + { + value = _mm256_add_epi8(value, _mm256_set1_epi8(frame_of_reference)); + } + else if constexpr (sizeof(T) == 2) + { + value = _mm256_add_epi16(value, _mm256_set1_epi16(frame_of_reference)); + } + else if constexpr (sizeof(T) == 4) + { + value = _mm256_add_epi32(value, _mm256_set1_epi32(frame_of_reference)); + } + else if constexpr (sizeof(T) == 8) + { + value = _mm256_add_epi64(value, _mm256_set1_epi64x(frame_of_reference)); + } + // Store the result back to memory + _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + i), value); + } +#endif + for (; i < count; ++i) + { + dst[i] += frame_of_reference; + } +} + +template void ApplyFrameOfReference(UInt8 *, UInt8, UInt32); +template void ApplyFrameOfReference(UInt16 *, UInt16, UInt32); +template void ApplyFrameOfReference(UInt32 *, UInt32, UInt32); +template void ApplyFrameOfReference(UInt64 *, UInt64, UInt32); + +template +void SubtractFrameOfReference(T * dst, T frame_of_reference, UInt32 count) +{ + if (frame_of_reference == 0) + return; + + UInt32 i = 0; +#if defined(__AVX2__) + UInt32 aligned_count = count - count % (sizeof(__m256i) / sizeof(T)); + for (; i < aligned_count; i += (sizeof(__m256i) / sizeof(T))) + { + // Load the data using SIMD + __m256i value = _mm256_loadu_si256(reinterpret_cast<__m256i *>(dst + i)); + // Perform vectorized addition + if constexpr (sizeof(T) == 1) + { + value = _mm256_sub_epi8(value, _mm256_set1_epi8(frame_of_reference)); + } + else if constexpr (sizeof(T) == 2) + { + value = _mm256_sub_epi16(value, _mm256_set1_epi16(frame_of_reference)); + } + else if constexpr (sizeof(T) == 4) + { + value = _mm256_sub_epi32(value, _mm256_set1_epi32(frame_of_reference)); + } + else if constexpr (sizeof(T) == 8) + { + value = _mm256_sub_epi64(value, _mm256_set1_epi64x(frame_of_reference)); + } + // Store the result back to memory + _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + i), value); + } +#endif + for (; i < count; ++i) + { + dst[i] -= frame_of_reference; + } +} + +template void SubtractFrameOfReference(Int8 *, Int8, UInt32); +template void SubtractFrameOfReference(Int16 *, Int16, UInt32); +template void SubtractFrameOfReference(Int32 *, Int32, UInt32); +template void SubtractFrameOfReference(Int64 *, Int64, UInt32); +template void SubtractFrameOfReference(UInt8 *, UInt8, UInt32); +template void SubtractFrameOfReference(UInt16 *, UInt16, UInt32); +template void SubtractFrameOfReference(UInt32 *, UInt32, UInt32); +template void SubtractFrameOfReference(UInt64 *, UInt64, UInt32); + +template +UInt8 ForEncodingWidth(std::vector & values, T frame_of_reference) +{ + if constexpr (std::is_signed_v) + { + // For signed types, after subtracting frame of reference, the range of values is not always [0, max_value - min_value]. + // For example, we have a sequence of Int8 values [-128, 1, 127], after subtracting frame of reference -128, the values are [0, -127, -1]. + // The minimum bit width required to store the values is 8 rather than the width of `max_value - min_value = -1`. + // So we need to calculate the minimum bit width of the values after subtracting frame of reference. + SubtractFrameOfReference(values.data(), frame_of_reference, values.size()); + T max_value = *std::max_element(values.cbegin(), values.cend()); + T min_value = *std::min_element(values.cbegin(), values.cend()); + return BitpackingPrimitives::minimumBitWidth(min_value, max_value); + } + else + { + T max_value = *std::max_element(values.cbegin(), values.cend()); + return BitpackingPrimitives::minimumBitWidth(max_value - frame_of_reference); + } +} + +template UInt8 ForEncodingWidth(std::vector &, Int8); +template UInt8 ForEncodingWidth(std::vector &, Int16); +template UInt8 ForEncodingWidth(std::vector &, Int32); +template UInt8 ForEncodingWidth(std::vector &, Int64); +template UInt8 ForEncodingWidth(std::vector &, UInt8); +template UInt8 ForEncodingWidth(std::vector &, UInt16); +template UInt8 ForEncodingWidth(std::vector &, UInt32); +template UInt8 ForEncodingWidth(std::vector &, UInt64); + +template +void DeltaDecoding(const char * source, UInt32 source_size, char * dest) +{ + ordinaryDeltaDecoding(source, source_size, dest); +} + +#if defined(__AVX2__) +// Note: using SIMD to rewrite compress does not improve performance. + +template <> +void DeltaDecoding(const char * __restrict__ raw_source, UInt32 raw_source_size, char * __restrict__ raw_dest) +{ + const auto * source = reinterpret_cast(raw_source); + auto source_size = raw_source_size / sizeof(UInt32); + auto * dest = reinterpret_cast(raw_dest); + __m128i prev = _mm_setzero_si128(); + size_t i = 0; + for (; i < source_size / 4; i++) + { + auto curr = _mm_lddqu_si128(reinterpret_cast(source) + i); + const auto tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); + const auto tmp2 = _mm_add_epi32(_mm_slli_si128(tmp1, 4), tmp1); + prev = _mm_add_epi32(tmp2, _mm_shuffle_epi32(prev, 0xff)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + i, prev); + } + uint32_t lastprev = _mm_extract_epi32(prev, 3); + for (i = 4 * i; i < source_size; ++i) + { + lastprev = lastprev + source[i]; + dest[i] = lastprev; + } +} + +template <> +void DeltaDecoding(const char * __restrict__ raw_source, UInt32 raw_source_size, char * __restrict__ raw_dest) +{ + const auto * source = reinterpret_cast(raw_source); + auto source_size = raw_source_size / sizeof(UInt64); + auto * dest = reinterpret_cast(raw_dest); + // AVX2 does not support shffule across 128-bit lanes, so we need to use permute. + __m256i prev = _mm256_setzero_si256(); + __m256i zero = _mm256_setzero_si256(); + size_t i = 0; + for (; i < source_size / 4; ++i) + { + // curr = {a0, a1, a2, a3} + auto curr = _mm256_loadu_si256(reinterpret_cast(source) + i); + // x0 = {0, a0, a1, a2} + auto x0 = _mm256_blend_epi32(_mm256_permute4x64_epi64(curr, 0b10010011), zero, 0b00000011); + // x1 = {a0, a01, a12, a23} + auto x1 = _mm256_add_epi64(curr, x0); + // x2 = {0, 0, a0, a01} + auto x2 = _mm256_permute2f128_si256(x1, x1, 0b00101000); + // prev = prev + {a0, a01, a012, a0123} + prev = _mm256_add_epi64(prev, _mm256_add_epi64(x1, x2)); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(dest) + i, prev); + // prev = {prev[3], prev[3], prev[3], prev[3]} + prev = _mm256_permute4x64_epi64(prev, 0b11111111); + } + UInt64 lastprev = _mm256_extract_epi64(prev, 3); + for (i = 4 * i; i < source_size; ++i) + { + lastprev += source[i]; + dest[i] = lastprev; + } +} + +#endif + +template +void DeltaForDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +{ + static_assert(std::is_integral::value, "Integral required."); + OrdinaryDeltaForDecoding(src, source_size, dest, dest_size); +} + +template <> +void DeltaForDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +{ + const auto count = dest_size / sizeof(UInt32); + auto round_size = BitpackingPrimitives::roundUpToAlgorithmGroupSize(count); + // Reserve enough space for the temporary buffer. + const auto required_size = round_size * sizeof(UInt32); + char tmp_buffer[required_size]; + memset(tmp_buffer, 0, required_size); + ForDecoding(src, source_size, tmp_buffer, required_size); + DeltaDecoding(reinterpret_cast(tmp_buffer), dest_size, dest); +} + +template <> +void DeltaForDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +{ + const auto count = dest_size / sizeof(UInt64); + const auto round_size = BitpackingPrimitives::roundUpToAlgorithmGroupSize(count); + // Reserve enough space for the temporary buffer. + const auto required_size = round_size * sizeof(UInt64); + char tmp_buffer[required_size]; + memset(tmp_buffer, 0, required_size); + ForDecoding(src, source_size, tmp_buffer, required_size); + DeltaDecoding(reinterpret_cast(tmp_buffer), dest_size, dest); +} + +template void DeltaForDecoding(const char *, UInt32, char *, UInt32); +template void DeltaForDecoding(const char *, UInt32, char *, UInt32); + +} // namespace DB::Compression diff --git a/dbms/src/IO/Compression/EncodingUtil.h b/dbms/src/IO/Compression/EncodingUtil.h new file mode 100644 index 00000000000..db39960d7ce --- /dev/null +++ b/dbms/src/IO/Compression/EncodingUtil.h @@ -0,0 +1,251 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#if defined(__AVX2__) +#include +#endif + +namespace DB::ErrorCodes +{ +extern const int CANNOT_COMPRESS; +extern const int CANNOT_DECOMPRESS; +} // namespace DB::ErrorCodes + +namespace DB::Compression +{ + +/// Constant encoding + +template +size_t ConstantEncoding(T constant, char * dest) +{ + unalignedStore(dest, constant); + return sizeof(T); +} + +template +void ConstantDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +{ + if (source_size < sizeof(T)) + throw Exception( + ErrorCodes::CANNOT_DECOMPRESS, + "Cannot use Constant decoding, data size {} is too small", + source_size); + + T constant = unalignedLoad(src); + for (size_t i = 0; i < dest_size / sizeof(T); ++i) + { + unalignedStore(dest, constant); + dest += sizeof(T); + } +} + +/// Constant delta encoding + +template +size_t ConstantDeltaEncoding(T first_value, T constant_delta, char * dest) +{ + unalignedStore(dest, first_value); + dest += sizeof(T); + unalignedStore(dest, constant_delta); + return sizeof(T) + sizeof(T); +} + +template +void ConstantDeltaDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +{ + if (source_size < sizeof(T) + sizeof(T)) + throw Exception( + ErrorCodes::CANNOT_DECOMPRESS, + "Cannot use ConstantDelta decoding, data size {} is too small", + source_size); + + T first_value = unalignedLoad(src); + T constant_delta = unalignedLoad(src + sizeof(T)); + for (size_t i = 0; i < dest_size / sizeof(T); ++i) + { + unalignedStore(dest, first_value); + first_value += constant_delta; + dest += sizeof(T); + } +} + +/// Run-length encoding + +template +using RLEPair = std::pair; +template +using RLEPairs = std::vector>; +template +static constexpr size_t RLEPairLength = sizeof(T) + sizeof(UInt8); + +template +size_t RLEPairsSize(const RLEPairs & rle) +{ + return rle.size() * RLEPairLength; +} + +template +size_t RLEEncoding(const RLEPairs & rle, char * dest) +{ + for (const auto & [value, count] : rle) + { + unalignedStore(dest, value); + dest += sizeof(T); + unalignedStore(dest, count); + dest += sizeof(UInt8); + } + return rle.size() * RLEPairLength; +} + +template +void RLEDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +{ + if unlikely (source_size % RLEPairLength != 0) + throw Exception( + ErrorCodes::CANNOT_DECOMPRESS, + "Cannot use RLE decoding, data size {} is not aligned to {}", + source_size, + RLEPairLength); + + const char * dest_end = dest + dest_size; + for (UInt32 i = 0; i < source_size / RLEPairLength; ++i) + { + T value = unalignedLoad(src); + src += sizeof(T); + auto count = unalignedLoad(src); + src += sizeof(UInt8); + if (dest + count * sizeof(T) > dest_end) + throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot use RLE decoding, data is too large"); + for (UInt8 j = 0; j < count; ++j) + { + unalignedStore(dest, value); + dest += sizeof(T); + } + } +} + +/// Frame of Reference encoding + +template +void SubtractFrameOfReference(T * dst, T frame_of_reference, UInt32 count); + +template +UInt8 ForEncodingWidth(std::vector & values, T frame_of_reference); + +template +size_t ForEncoding(std::vector & values, T frame_of_reference, UInt8 width, char * dest) +{ + assert(!values.empty()); + if constexpr (!skip_subtract_frame_of_reference) + SubtractFrameOfReference(values.data(), frame_of_reference, values.size()); + // store frame of reference + unalignedStore(dest, frame_of_reference); + dest += sizeof(T); + // store width + unalignedStore(dest, width); + dest += sizeof(UInt8); + // if width == 0, skip bitpacking + if (width == 0) + return sizeof(T) + sizeof(UInt8); + auto required_size = BitpackingPrimitives::getRequiredSize(values.size(), width); + // after applying frame of reference, all values are bigger than 0. + BitpackingPrimitives::packBuffer(reinterpret_cast(dest), values.data(), values.size(), width); + return sizeof(T) + sizeof(UInt8) + required_size; +} + +template +void ApplyFrameOfReference(T * dst, T frame_of_reference, UInt32 count); + +template +void ForDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +{ + const auto count = dest_size / sizeof(T); + T frame_of_reference = unalignedLoad(src); + src += sizeof(T); + auto width = unalignedLoad(src); + src += sizeof(UInt8); + const auto required_size = source_size - sizeof(T) - sizeof(UInt8); + RUNTIME_CHECK(BitpackingPrimitives::getRequiredSize(count, width) == required_size); + auto round_size = BitpackingPrimitives::roundUpToAlgorithmGroupSize(count); + if (round_size != count) + { + // Reserve enough space for the temporary buffer. + unsigned char tmp_buffer[round_size * sizeof(T)]; + BitpackingPrimitives::unPackBuffer(tmp_buffer, reinterpret_cast(src), count, width); + ApplyFrameOfReference(reinterpret_cast(tmp_buffer), frame_of_reference, count); + memcpy(dest, tmp_buffer, dest_size); + return; + } + BitpackingPrimitives::unPackBuffer( + reinterpret_cast(dest), + reinterpret_cast(src), + count, + width); + ApplyFrameOfReference(reinterpret_cast(dest), frame_of_reference, count); +} + +/// Delta encoding + +template +void DeltaEncoding(const T * source, UInt32 count, T * dest) +{ + T prev = 0; + for (UInt32 i = 0; i < count; ++i) + { + T curr = source[i]; + dest[i] = curr - prev; + prev = curr; + } +} + +template +void ordinaryDeltaDecoding(const char * source, UInt32 source_size, char * dest) +{ + T accumulator{}; + const char * const source_end = source + source_size; + while (source < source_end) + { + accumulator += unalignedLoad(source); + unalignedStore(dest, accumulator); + + source += sizeof(T); + dest += sizeof(T); + } +} + +template +void DeltaDecoding(const char * source, UInt32 source_size, char * dest); + +/// Delta + Frame of Reference encoding + +template +void OrdinaryDeltaForDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +{ + using TS = typename std::make_signed_t; + ForDecoding(src, source_size, dest, dest_size); + ordinaryDeltaDecoding(dest, dest_size, dest); +} + +template +void DeltaForDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size); + +} // namespace DB::Compression diff --git a/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp b/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp index 8beb29f22df..76781a3ac10 100644 --- a/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp +++ b/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp @@ -532,6 +532,7 @@ std::vector generatePyramidOfSequences( #define G(generator) generator, #generator const auto IntegerCodecsToTest = ::testing::Values( + CompressionMethodByte::Lightweight, CompressionMethodByte::DeltaFOR, CompressionMethodByte::FOR, CompressionMethodByte::RLE From c583557c65da662909a2b69d41c19fba993292e5 Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Thu, 9 May 2024 13:54:12 +0800 Subject: [PATCH 02/27] rename Signed-off-by: Lloyd-Pottiger --- .../Compression/CompressionCodecDeltaFOR.cpp | 16 ++++----- .../IO/Compression/CompressionCodecFOR.cpp | 12 +++---- .../CompressionCodecIntegerLightweight.cpp | 10 +++--- dbms/src/IO/Compression/EncodingUtil.cpp | 34 +++++++++---------- dbms/src/IO/Compression/EncodingUtil.h | 12 +++---- 5 files changed, 42 insertions(+), 42 deletions(-) diff --git a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp index d2bcbbe3262..73880a3424f 100644 --- a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp +++ b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp @@ -114,16 +114,16 @@ void CompressionCodecDeltaFOR::doDecompressData( switch (bytes_size) { case 1: - DB::Compression::DeltaForDecoding(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::DeltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; case 2: - DB::Compression::DeltaForDecoding(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::DeltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; case 4: - DB::Compression::DeltaForDecoding(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::DeltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; case 8: - DB::Compression::DeltaForDecoding(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::DeltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; default: throw Exception( @@ -158,16 +158,16 @@ void CompressionCodecDeltaFOR::ordinaryDecompress( switch (bytes_size) { case 1: - DB::Compression::OrdinaryDeltaForDecoding(&source[1], source_size_no_header, dest, dest_size); + DB::Compression::OrdinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, dest_size); break; case 2: - DB::Compression::OrdinaryDeltaForDecoding(&source[1], source_size_no_header, dest, dest_size); + DB::Compression::OrdinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, dest_size); break; case 4: - DB::Compression::OrdinaryDeltaForDecoding(&source[1], source_size_no_header, dest, dest_size); + DB::Compression::OrdinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, dest_size); break; case 8: - DB::Compression::OrdinaryDeltaForDecoding(&source[1], source_size_no_header, dest, dest_size); + DB::Compression::OrdinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, dest_size); break; default: throw Exception( diff --git a/dbms/src/IO/Compression/CompressionCodecFOR.cpp b/dbms/src/IO/Compression/CompressionCodecFOR.cpp index 86b21719744..db3b7511bb0 100644 --- a/dbms/src/IO/Compression/CompressionCodecFOR.cpp +++ b/dbms/src/IO/Compression/CompressionCodecFOR.cpp @@ -56,8 +56,8 @@ UInt32 CompressionCodecFOR::compressData(const T * source, UInt32 count, char * std::vector values(count); values.assign(source, source + count); T frame_of_reference = *std::min_element(values.cbegin(), values.cend()); - UInt8 width = DB::Compression::ForEncodingWidth(values, frame_of_reference); - return DB::Compression::ForEncoding>(values, frame_of_reference, width, dest); + UInt8 width = DB::Compression::FOREncodingWidth(values, frame_of_reference); + return DB::Compression::FOREncoding>(values, frame_of_reference, width, dest); } UInt32 CompressionCodecFOR::doCompressData(const char * source, UInt32 source_size, char * dest) const @@ -108,16 +108,16 @@ void CompressionCodecFOR::doDecompressData( switch (bytes_size) { case 1: - DB::Compression::ForDecoding(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::FORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; case 2: - DB::Compression::ForDecoding(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::FORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; case 4: - DB::Compression::ForDecoding(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::FORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; case 8: - DB::Compression::ForDecoding(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::FORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; default: throw Exception( diff --git a/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp b/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp index 9dd9445a53a..c0ebe0bdb69 100644 --- a/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp +++ b/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp @@ -99,13 +99,13 @@ size_t CompressionCodecIntegerLightweight::compressDataForType(const char * sour case Mode::FOR: { FORState for_state = std::get<2>(state); - compressed_size += Compression::ForEncoding(values, for_state.min_value, for_state.bit_width, dest); + compressed_size += Compression::FOREncoding(values, for_state.min_value, for_state.bit_width, dest); break; } case Mode::DELTA_FOR: { DeltaFORState delta_for_state = std::get<3>(state); - compressed_size += Compression::ForEncoding, true>( + compressed_size += Compression::FOREncoding, true>( delta_for_state.deltas, delta_for_state.min_delta_value, delta_for_state.bit_width, @@ -160,10 +160,10 @@ void CompressionCodecIntegerLightweight::decompressDataForType( Compression::RLEDecoding(source, source_size, dest, output_size); break; case Mode::FOR: - Compression::ForDecoding(source, source_size, dest, output_size); + Compression::FORDecoding(source, source_size, dest, output_size); break; case Mode::DELTA_FOR: - Compression::DeltaForDecoding(source, source_size, dest, output_size); + Compression::DeltaFORDecoding(source, source_size, dest, output_size); break; case Mode::LZ4: if (unlikely(LZ4_decompress_safe(source, dest, source_size, output_size) < 0)) @@ -254,7 +254,7 @@ void CompressionCodecIntegerLightweight::CompressContext::analyze(std::vector return; } - UInt8 delta_for_width = Compression::ForEncodingWidth(deltas, min_delta); + UInt8 delta_for_width = Compression::FOREncodingWidth(deltas, min_delta); // additional T bytes for min_delta, and 1 byte for width size_t delta_for_size = BitpackingPrimitives::getRequiredSize(deltas.size(), delta_for_width) + sizeof(T) + sizeof(UInt8); diff --git a/dbms/src/IO/Compression/EncodingUtil.cpp b/dbms/src/IO/Compression/EncodingUtil.cpp index 331d3144959..db24b7fa68b 100644 --- a/dbms/src/IO/Compression/EncodingUtil.cpp +++ b/dbms/src/IO/Compression/EncodingUtil.cpp @@ -116,7 +116,7 @@ template void SubtractFrameOfReference(UInt32 *, UInt32, UInt32); template void SubtractFrameOfReference(UInt64 *, UInt64, UInt32); template -UInt8 ForEncodingWidth(std::vector & values, T frame_of_reference) +UInt8 FOREncodingWidth(std::vector & values, T frame_of_reference) { if constexpr (std::is_signed_v) { @@ -136,14 +136,14 @@ UInt8 ForEncodingWidth(std::vector & values, T frame_of_reference) } } -template UInt8 ForEncodingWidth(std::vector &, Int8); -template UInt8 ForEncodingWidth(std::vector &, Int16); -template UInt8 ForEncodingWidth(std::vector &, Int32); -template UInt8 ForEncodingWidth(std::vector &, Int64); -template UInt8 ForEncodingWidth(std::vector &, UInt8); -template UInt8 ForEncodingWidth(std::vector &, UInt16); -template UInt8 ForEncodingWidth(std::vector &, UInt32); -template UInt8 ForEncodingWidth(std::vector &, UInt64); +template UInt8 FOREncodingWidth(std::vector &, Int8); +template UInt8 FOREncodingWidth(std::vector &, Int16); +template UInt8 FOREncodingWidth(std::vector &, Int32); +template UInt8 FOREncodingWidth(std::vector &, Int64); +template UInt8 FOREncodingWidth(std::vector &, UInt8); +template UInt8 FOREncodingWidth(std::vector &, UInt16); +template UInt8 FOREncodingWidth(std::vector &, UInt32); +template UInt8 FOREncodingWidth(std::vector &, UInt64); template void DeltaDecoding(const char * source, UInt32 source_size, char * dest) @@ -215,14 +215,14 @@ void DeltaDecoding(const char * __restrict__ raw_source, UInt32 raw_sour #endif template -void DeltaForDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +void DeltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { static_assert(std::is_integral::value, "Integral required."); - OrdinaryDeltaForDecoding(src, source_size, dest, dest_size); + OrdinaryDeltaFORDecoding(src, source_size, dest, dest_size); } template <> -void DeltaForDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +void DeltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { const auto count = dest_size / sizeof(UInt32); auto round_size = BitpackingPrimitives::roundUpToAlgorithmGroupSize(count); @@ -230,12 +230,12 @@ void DeltaForDecoding(const char * src, UInt32 source_size, char * dest, const auto required_size = round_size * sizeof(UInt32); char tmp_buffer[required_size]; memset(tmp_buffer, 0, required_size); - ForDecoding(src, source_size, tmp_buffer, required_size); + FORDecoding(src, source_size, tmp_buffer, required_size); DeltaDecoding(reinterpret_cast(tmp_buffer), dest_size, dest); } template <> -void DeltaForDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +void DeltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { const auto count = dest_size / sizeof(UInt64); const auto round_size = BitpackingPrimitives::roundUpToAlgorithmGroupSize(count); @@ -243,11 +243,11 @@ void DeltaForDecoding(const char * src, UInt32 source_size, char * dest, const auto required_size = round_size * sizeof(UInt64); char tmp_buffer[required_size]; memset(tmp_buffer, 0, required_size); - ForDecoding(src, source_size, tmp_buffer, required_size); + FORDecoding(src, source_size, tmp_buffer, required_size); DeltaDecoding(reinterpret_cast(tmp_buffer), dest_size, dest); } -template void DeltaForDecoding(const char *, UInt32, char *, UInt32); -template void DeltaForDecoding(const char *, UInt32, char *, UInt32); +template void DeltaFORDecoding(const char *, UInt32, char *, UInt32); +template void DeltaFORDecoding(const char *, UInt32, char *, UInt32); } // namespace DB::Compression diff --git a/dbms/src/IO/Compression/EncodingUtil.h b/dbms/src/IO/Compression/EncodingUtil.h index db39960d7ce..e07a8555304 100644 --- a/dbms/src/IO/Compression/EncodingUtil.h +++ b/dbms/src/IO/Compression/EncodingUtil.h @@ -149,10 +149,10 @@ template void SubtractFrameOfReference(T * dst, T frame_of_reference, UInt32 count); template -UInt8 ForEncodingWidth(std::vector & values, T frame_of_reference); +UInt8 FOREncodingWidth(std::vector & values, T frame_of_reference); template -size_t ForEncoding(std::vector & values, T frame_of_reference, UInt8 width, char * dest) +size_t FOREncoding(std::vector & values, T frame_of_reference, UInt8 width, char * dest) { assert(!values.empty()); if constexpr (!skip_subtract_frame_of_reference) @@ -176,7 +176,7 @@ template void ApplyFrameOfReference(T * dst, T frame_of_reference, UInt32 count); template -void ForDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +void FORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { const auto count = dest_size / sizeof(T); T frame_of_reference = unalignedLoad(src); @@ -238,14 +238,14 @@ void DeltaDecoding(const char * source, UInt32 source_size, char * dest); /// Delta + Frame of Reference encoding template -void OrdinaryDeltaForDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +void OrdinaryDeltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { using TS = typename std::make_signed_t; - ForDecoding(src, source_size, dest, dest_size); + FORDecoding(src, source_size, dest, dest_size); ordinaryDeltaDecoding(dest, dest_size, dest); } template -void DeltaForDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size); +void DeltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size); } // namespace DB::Compression From a8d7de5574c59d3a9ca2e4b35e0a42558c38182b Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Thu, 9 May 2024 13:56:43 +0800 Subject: [PATCH 03/27] ut Signed-off-by: Lloyd-Pottiger --- dbms/src/IO/Compression/tests/gtest_codec_compression.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp b/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp index 76781a3ac10..f1381484c31 100644 --- a/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp +++ b/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp @@ -534,7 +534,7 @@ std::vector generatePyramidOfSequences( const auto IntegerCodecsToTest = ::testing::Values( CompressionMethodByte::Lightweight, CompressionMethodByte::DeltaFOR, - CompressionMethodByte::FOR, + // CompressionMethodByte::FOR, // disable FOR codec for now, since there are too many unit tests. CompressionMethodByte::RLE #if USE_QPL , From 86bab117212d500b34c2222870abe31d91313ec0 Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Thu, 9 May 2024 14:44:24 +0800 Subject: [PATCH 04/27] init template Signed-off-by: Lloyd-Pottiger --- dbms/src/IO/Compression/EncodingUtil.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dbms/src/IO/Compression/EncodingUtil.cpp b/dbms/src/IO/Compression/EncodingUtil.cpp index db24b7fa68b..b947bf6dd84 100644 --- a/dbms/src/IO/Compression/EncodingUtil.cpp +++ b/dbms/src/IO/Compression/EncodingUtil.cpp @@ -65,6 +65,10 @@ template void ApplyFrameOfReference(UInt8 *, UInt8, UInt32); template void ApplyFrameOfReference(UInt16 *, UInt16, UInt32); template void ApplyFrameOfReference(UInt32 *, UInt32, UInt32); template void ApplyFrameOfReference(UInt64 *, UInt64, UInt32); +template void ApplyFrameOfReference(Int8 *, Int8, UInt32); +template void ApplyFrameOfReference(Int16 *, Int16, UInt32); +template void ApplyFrameOfReference(Int32 *, Int32, UInt32); +template void ApplyFrameOfReference(Int64 *, Int64, UInt32); template void SubtractFrameOfReference(T * dst, T frame_of_reference, UInt32 count) From bdd4e1f6eca71ebcce0f51237deec839e25fb347 Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Fri, 10 May 2024 11:49:47 +0800 Subject: [PATCH 05/27] optimize analyze Signed-off-by: Lloyd-Pottiger --- .../CompressionCodecIntegerLightweight.cpp | 126 ++++++++++-------- .../CompressionCodecIntegerLightweight.h | 11 +- 2 files changed, 83 insertions(+), 54 deletions(-) diff --git a/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp b/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp index c0ebe0bdb69..27ca11b2cf5 100644 --- a/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp +++ b/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp @@ -49,27 +49,16 @@ UInt8 CompressionCodecIntegerLightweight::getMethodByte() const UInt32 CompressionCodecIntegerLightweight::getMaxCompressedDataSize(UInt32 uncompressed_size) const { // 1 byte for bytes_size, 1 byte for mode, and the rest for compressed data - return 1 + 1 + uncompressed_size; + return 1 + 1 + LZ4_COMPRESSBOUND(uncompressed_size); } template size_t CompressionCodecIntegerLightweight::compressDataForType(const char * source, UInt32 source_size, char * dest) const { - if (source_size % sizeof(T) != 0) - throw Exception( - ErrorCodes::CANNOT_COMPRESS, - "Cannot compress with lightweight codec, data size {} is not aligned to {}", - source_size, - sizeof(T)); - // Load values const size_t count = source_size / sizeof(T); - std::vector values(count); - for (size_t i = 0; i < count; ++i) - { - values[i] = unalignedLoad(source + i * sizeof(T)); - } + std::span values(reinterpret_cast(source), count); // Analyze State state; @@ -99,7 +88,7 @@ size_t CompressionCodecIntegerLightweight::compressDataForType(const char * sour case Mode::FOR: { FORState for_state = std::get<2>(state); - compressed_size += Compression::FOREncoding(values, for_state.min_value, for_state.bit_width, dest); + compressed_size += Compression::FOREncoding(for_state.values, for_state.min_value, for_state.bit_width, dest); break; } case Mode::DELTA_FOR: @@ -191,6 +180,12 @@ void CompressionCodecIntegerLightweight::CompressContext::update(size_t uncompre lw_compressed_size += compressed_size; ++lw_counter; } + if (mode == Mode::CONSTANT_DELTA) + ++constant_delta_counter; + if (mode == Mode::DELTA_FOR) + ++delta_for_counter; + if (mode == Mode::RLE) + ++rle_counter; } bool CompressionCodecIntegerLightweight::CompressContext::needAnalyze() const @@ -204,78 +199,103 @@ bool CompressionCodecIntegerLightweight::CompressContext::needAnalyze() const return true; } -template -void CompressionCodecIntegerLightweight::CompressContext::analyze(std::vector & values, State & state) +bool CompressionCodecIntegerLightweight::CompressContext::needAnalyzeDelta() const { - if (!needAnalyze()) - return; + return lw_counter <= 5 || constant_delta_counter != 0 || delta_for_counter != 0; +} +bool CompressionCodecIntegerLightweight::CompressContext::needAnalyzeRLE() const +{ + return lw_counter <= 5 || rle_counter != 0; +} + +template +void CompressionCodecIntegerLightweight::CompressContext::analyze(std::span & values, State & state) +{ if (values.empty()) { mode = Mode::Invalid; return; } + if (!needAnalyze()) + return; + // Check CONSTANT - std::vector> rle; - rle.reserve(values.size()); - rle.emplace_back(values[0], 1); - for (size_t i = 1; i < values.size(); ++i) - { - if (values[i] != values[i - 1] || rle.back().second == std::numeric_limits::max()) - rle.emplace_back(values[i], 1); - else - ++rle.back().second; - } - T min_value = *std::min_element(values.cbegin(), values.cend()); - T max_value = *std::max_element(values.cbegin(), values.cend()); - if (rle.size() == 1) + T min_value = *std::min_element(values.begin(), values.end()); + T max_value = *std::max_element(values.begin(), values.end()); + if (min_value == max_value) { - state = rle[0].first; + state = min_value; mode = Mode::CONSTANT; return; } - // Check CONSTANT_DELTA using TS = std::make_signed_t; std::vector deltas; - deltas.reserve(values.size()); - deltas.push_back(values[0]); - for (size_t i = 1; i < values.size(); ++i) + UInt8 delta_for_width = sizeof(T) * 8; + size_t delta_for_size = std::numeric_limits::max(); + TS min_delta = std::numeric_limits::min(); + if (needAnalyzeDelta()) { - deltas.push_back(values[i] - values[i - 1]); + // Check CONSTANT_DELTA + deltas.reserve(values.size()); + deltas.push_back(values[0]); + for (size_t i = 1; i < values.size(); ++i) + { + deltas.push_back(values[i] - values[i - 1]); + } + min_delta = *std::min_element(deltas.cbegin(), deltas.cend()); + if (min_delta == *std::max_element(deltas.cbegin(), deltas.cend())) + { + state = static_cast(min_delta); + mode = Mode::CONSTANT_DELTA; + return; + } + + // DELTA_FOR + delta_for_width = Compression::FOREncodingWidth(deltas, min_delta); + // additional T bytes for min_delta, and 1 byte for width + delta_for_size + = BitpackingPrimitives::getRequiredSize(deltas.size(), delta_for_width) + sizeof(T) + sizeof(UInt8); } - TS min_delta = *std::min_element(deltas.cbegin(), deltas.cend()); - TS max_delta = *std::max_element(deltas.cbegin(), deltas.cend()); - if (min_delta == max_delta) + + // RLE + std::vector> rle; + if (needAnalyzeRLE()) { - state = static_cast(min_delta); - mode = Mode::CONSTANT_DELTA; - return; + rle.reserve(values.size()); + rle.emplace_back(values[0], 1); + for (size_t i = 1; i < values.size(); ++i) + { + if (values[i] != values[i - 1] || rle.back().second == std::numeric_limits::max()) + rle.emplace_back(values[i], 1); + else + ++rle.back().second; + } } - UInt8 delta_for_width = Compression::FOREncodingWidth(deltas, min_delta); - // additional T bytes for min_delta, and 1 byte for width - size_t delta_for_size - = BitpackingPrimitives::getRequiredSize(deltas.size(), delta_for_width) + sizeof(T) + sizeof(UInt8); UInt8 for_width = BitpackingPrimitives::minimumBitWidth(max_value - min_value); // additional T bytes for min_value, and 1 byte for width size_t for_size = BitpackingPrimitives::getRequiredSize(values.size(), for_width) + sizeof(T) + sizeof(UInt8); - size_t origin_size = values.size() * sizeof(T); + // Assume that the compression ratio of LZ4 is 3.0 + // The official document says that the compression ratio of LZ4 is 2.1, https://github.com/lz4/lz4 + size_t estimate_lz_size = values.size() * sizeof(T) / 3; size_t rle_size = Compression::RLEPairsSize(rle); - if (rle_size < delta_for_size && rle_size < for_size && rle_size < origin_size) + if (rle_size < delta_for_size && rle_size < for_size && rle_size < estimate_lz_size) { state = std::move(rle); mode = Mode::RLE; } - else if (for_size < delta_for_size && for_size < origin_size) + else if (for_size < delta_for_size && for_size < estimate_lz_size) { - state = FORState{min_value, for_width}; + std::vector values_copy(values.begin(), values.end()); + state = FORState{std::move(values_copy), min_value, for_width}; mode = Mode::FOR; } - else if (delta_for_size < origin_size) + else if (delta_for_size < estimate_lz_size) { - state = DeltaFORState{deltas, min_delta, delta_for_width}; + state = DeltaFORState{std::move(deltas), min_delta, delta_for_width}; mode = Mode::DELTA_FOR; } else diff --git a/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.h b/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.h index bbc7f8a7191..f2760bcaf05 100644 --- a/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.h +++ b/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.h @@ -16,6 +16,9 @@ #include +#include + + namespace DB { @@ -58,6 +61,7 @@ class CompressionCodecIntegerLightweight : public ICompressionCodec template struct FORState { + std::vector values; T min_value; UInt8 bit_width; }; @@ -81,9 +85,11 @@ class CompressionCodecIntegerLightweight : public ICompressionCodec CompressContext() = default; bool needAnalyze() const; + bool needAnalyzeDelta() const; + bool needAnalyzeRLE() const; template - void analyze(std::vector & values, State & state); + void analyze(std::span & values, State & state); void update(size_t uncompressed_size, size_t compressed_size); @@ -96,6 +102,9 @@ class CompressionCodecIntegerLightweight : public ICompressionCodec size_t lz4_uncompressed_size = 0; size_t lz4_compressed_size = 0; size_t lz4_counter = 0; + size_t constant_delta_counter = 0; + size_t delta_for_counter = 0; + size_t rle_counter = 0; }; template From 40d39a2a9f16972f0b9c49b02a425d614e4a906b Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Thu, 9 May 2024 11:39:21 +0800 Subject: [PATCH 06/27] optimize & rename Signed-off-by: Lloyd-Pottiger --- .../CompressionCodecIntegerLightweight.cpp | 50 ++++++++++++++----- .../CompressionCodecIntegerLightweight.h | 13 +++-- ...cRLE.cpp => CompressionCodecRunLength.cpp} | 42 +++++++++------- ...CodecRLE.h => CompressionCodecRunLength.h} | 4 +- dbms/src/IO/Compression/CompressionFactory.h | 6 +-- dbms/src/IO/Compression/CompressionInfo.h | 2 +- dbms/src/IO/Compression/CompressionSettings.h | 2 +- dbms/src/IO/Compression/EncodingUtil.h | 41 ++++++++------- .../tests/gtest_codec_compression.cpp | 2 +- .../Storages/DeltaMerge/File/DMFileWriter.h | 21 ++++++-- 10 files changed, 119 insertions(+), 64 deletions(-) rename dbms/src/IO/Compression/{CompressionCodecRLE.cpp => CompressionCodecRunLength.cpp} (67%) rename dbms/src/IO/Compression/{CompressionCodecRLE.h => CompressionCodecRunLength.h} (91%) diff --git a/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp b/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp index 27ca11b2cf5..08b87bad5b8 100644 --- a/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp +++ b/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp @@ -52,6 +52,12 @@ UInt32 CompressionCodecIntegerLightweight::getMaxCompressedDataSize(UInt32 uncom return 1 + 1 + LZ4_COMPRESSBOUND(uncompressed_size); } +CompressionCodecIntegerLightweight::~CompressionCodecIntegerLightweight() +{ + if (ctx.isCompression()) + LOG_INFO(Logger::get(), "lightweight codec: {}", ctx.toDebugString()); +} + template size_t CompressionCodecIntegerLightweight::compressDataForType(const char * source, UInt32 source_size, char * dest) const @@ -80,9 +86,9 @@ size_t CompressionCodecIntegerLightweight::compressDataForType(const char * sour compressed_size += Compression::ConstantDeltaEncoding(values[0], std::get<0>(state), dest); break; } - case Mode::RLE: + case Mode::RunLength: { - compressed_size += Compression::RLEEncoding(std::get<1>(state), dest); + compressed_size += Compression::RunLengthEncoding(std::get<1>(state), dest); break; } case Mode::FOR: @@ -145,8 +151,8 @@ void CompressionCodecIntegerLightweight::decompressDataForType( case Mode::CONSTANT_DELTA: Compression::ConstantDeltaDecoding(source, source_size, dest, output_size); break; - case Mode::RLE: - Compression::RLEDecoding(source, source_size, dest, output_size); + case Mode::RunLength: + Compression::RunLengthDecoding(source, source_size, dest, output_size); break; case Mode::FOR: Compression::FORDecoding(source, source_size, dest, output_size); @@ -166,6 +172,21 @@ void CompressionCodecIntegerLightweight::decompressDataForType( } } +String CompressionCodecIntegerLightweight::CompressContext::toDebugString() const +{ + return fmt::format( + "lz4: {}, lightweight: {}, constant_delta: {}, delta_for: {}, rle: {}, lz4 {} -> {}, lightweight {} -> {}", + lz4_counter, + lw_counter, + constant_delta_counter, + delta_for_counter, + rle_counter, + lz4_uncompressed_size, + lz4_compressed_size, + lw_uncompressed_size, + lw_compressed_size); +} + void CompressionCodecIntegerLightweight::CompressContext::update(size_t uncompressed_size, size_t compressed_size) { if (mode == Mode::LZ4) @@ -184,7 +205,7 @@ void CompressionCodecIntegerLightweight::CompressContext::update(size_t uncompre ++constant_delta_counter; if (mode == Mode::DELTA_FOR) ++delta_for_counter; - if (mode == Mode::RLE) + if (mode == Mode::RunLength) ++rle_counter; } @@ -204,7 +225,7 @@ bool CompressionCodecIntegerLightweight::CompressContext::needAnalyzeDelta() con return lw_counter <= 5 || constant_delta_counter != 0 || delta_for_counter != 0; } -bool CompressionCodecIntegerLightweight::CompressContext::needAnalyzeRLE() const +bool CompressionCodecIntegerLightweight::CompressContext::needAnalyzeRunLength() const { return lw_counter <= 5 || rle_counter != 0; } @@ -219,7 +240,10 @@ void CompressionCodecIntegerLightweight::CompressContext::analyze(std::span> rle; - if (needAnalyzeRLE()) + // RunLength + Compression::RunLengthPairs rle; + if (needAnalyzeRunLength()) { rle.reserve(values.size()); rle.emplace_back(values[0], 1); @@ -281,11 +305,11 @@ void CompressionCodecIntegerLightweight::CompressContext::analyze(std::span::max() : Compression::RunLengthPairsSize(rle); + if (needAnalyzeRunLength() && rle_size < delta_for_size && rle_size < for_size && rle_size < estimate_lz_size) { state = std::move(rle); - mode = Mode::RLE; + mode = Mode::RunLength; } else if (for_size < delta_for_size && for_size < estimate_lz_size) { @@ -293,7 +317,7 @@ void CompressionCodecIntegerLightweight::CompressContext::analyze(std::span{std::move(values_copy), min_value, for_width}; mode = Mode::FOR; } - else if (delta_for_size < estimate_lz_size) + else if (needAnalyzeDelta() && delta_for_size < estimate_lz_size) { state = DeltaFORState{std::move(deltas), min_delta, delta_for_width}; mode = Mode::DELTA_FOR; diff --git a/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.h b/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.h index f2760bcaf05..76b7db18599 100644 --- a/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.h +++ b/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.h @@ -29,6 +29,8 @@ class CompressionCodecIntegerLightweight : public ICompressionCodec UInt8 getMethodByte() const override; + ~CompressionCodecIntegerLightweight() override; + protected: UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override; void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) @@ -45,7 +47,7 @@ class CompressionCodecIntegerLightweight : public ICompressionCodec Invalid = 0, CONSTANT = 1, // all values are the same CONSTANT_DELTA = 2, // the difference between two adjacent values is the same - RLE = 3, // run-length encoding + RunLength = 3, // run-length encoding FOR = 4, // Frame of Reference encoding DELTA_FOR = 5, // delta encoding and then FOR encoding LZ4 = 6, // the above modes are not suitable, use LZ4 instead @@ -56,7 +58,7 @@ class CompressionCodecIntegerLightweight : public ICompressionCodec using ConstantState = T; template - using RLEState = std::vector>; + using RunLengthState = std::vector>; template struct FORState @@ -77,7 +79,7 @@ class CompressionCodecIntegerLightweight : public ICompressionCodec // State is a union of different states for different modes template - using State = std::variant, RLEState, FORState, DeltaFORState>; + using State = std::variant, RunLengthState, FORState, DeltaFORState>; class CompressContext { @@ -86,13 +88,16 @@ class CompressionCodecIntegerLightweight : public ICompressionCodec bool needAnalyze() const; bool needAnalyzeDelta() const; - bool needAnalyzeRLE() const; + bool needAnalyzeRunLength() const; template void analyze(std::span & values, State & state); void update(size_t uncompressed_size, size_t compressed_size); + String toDebugString() const; + bool isCompression() const { return lz4_counter > 0 || lw_counter > 0; } + Mode mode = Mode::LZ4; private: diff --git a/dbms/src/IO/Compression/CompressionCodecRLE.cpp b/dbms/src/IO/Compression/CompressionCodecRunLength.cpp similarity index 67% rename from dbms/src/IO/Compression/CompressionCodecRLE.cpp rename to dbms/src/IO/Compression/CompressionCodecRunLength.cpp index 27ddd53c4a7..5ca5fb0c4b5 100644 --- a/dbms/src/IO/Compression/CompressionCodecRLE.cpp +++ b/dbms/src/IO/Compression/CompressionCodecRunLength.cpp @@ -13,7 +13,7 @@ // limitations under the License. #include -#include +#include #include #include #include @@ -29,16 +29,16 @@ extern const int CANNOT_COMPRESS; extern const int CANNOT_DECOMPRESS; } // namespace ErrorCodes -CompressionCodecRLE::CompressionCodecRLE(UInt8 bytes_size_) +CompressionCodecRunLength::CompressionCodecRunLength(UInt8 bytes_size_) : bytes_size(bytes_size_) {} -UInt8 CompressionCodecRLE::getMethodByte() const +UInt8 CompressionCodecRunLength::getMethodByte() const { - return static_cast(CompressionMethodByte::RLE); + return static_cast(CompressionMethodByte::RunLength); } -UInt32 CompressionCodecRLE::getMaxCompressedDataSize(UInt32 uncompressed_size) const +UInt32 CompressionCodecRunLength::getMaxCompressedDataSize(UInt32 uncompressed_size) const { // If the encoded data is larger than the original data, we will store the original data // Additional byte is used to store the size of the data type @@ -53,7 +53,7 @@ template UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) { const char * source_end = source + source_size; - DB::Compression::RLEPairs rle_vec; + DB::Compression::RunLengthPairs rle_vec; rle_vec.reserve(source_size / sizeof(T)); for (const auto * src = source; src < source_end; src += sizeof(T)) { @@ -65,7 +65,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) ++rle_vec.back().second; } - if (DB::Compression::RLEPairsSize(rle_vec) > source_size) + if (DB::Compression::RunLengthPairsSize(rle_vec) > source_size) { dest[0] = JUST_COPY_CODE; memcpy(&dest[1], source, source_size); @@ -74,12 +74,12 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) dest[0] = sizeof(T); dest += 1; - return 1 + DB::Compression::RLEEncoding(rle_vec, dest); + return 1 + DB::Compression::RunLengthEncoding(rle_vec, dest); } } // namespace -UInt32 CompressionCodecRLE::doCompressData(const char * source, UInt32 source_size, char * dest) const +UInt32 CompressionCodecRunLength::doCompressData(const char * source, UInt32 source_size, char * dest) const { if unlikely (source_size % bytes_size != 0) throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "source size {} is not aligned to {}", source_size, bytes_size); @@ -94,18 +94,20 @@ UInt32 CompressionCodecRLE::doCompressData(const char * source, UInt32 source_si case 8: return compressDataForType(source, source_size, dest); default: - throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress RLE-encoded data. Unsupported bytes size"); + throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress RunLength-encoded data. Unsupported bytes size"); } } -void CompressionCodecRLE::doDecompressData( +void CompressionCodecRunLength::doDecompressData( const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const { if (source_size < 1) - throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress RLE-encoded data. File has wrong header"); + throw Exception( + ErrorCodes::CANNOT_DECOMPRESS, + "Cannot decompress RunLength-encoded data. File has wrong header"); if (uncompressed_size == 0) return; @@ -114,7 +116,9 @@ void CompressionCodecRLE::doDecompressData( if (bytes_size == JUST_COPY_CODE) { if (source_size - 1 < uncompressed_size) - throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress RLE-encoded data. File has wrong header"); + throw Exception( + ErrorCodes::CANNOT_DECOMPRESS, + "Cannot decompress RunLength-encoded data. File has wrong header"); memcpy(dest, &source[1], uncompressed_size); return; @@ -130,19 +134,21 @@ void CompressionCodecRLE::doDecompressData( switch (bytes_size) { case 1: - DB::Compression::RLEDecoding(&source[1], source_size - 1, dest, uncompressed_size); + DB::Compression::RunLengthDecoding(&source[1], source_size - 1, dest, uncompressed_size); break; case 2: - DB::Compression::RLEDecoding(&source[1], source_size - 1, dest, uncompressed_size); + DB::Compression::RunLengthDecoding(&source[1], source_size - 1, dest, uncompressed_size); break; case 4: - DB::Compression::RLEDecoding(&source[1], source_size - 1, dest, uncompressed_size); + DB::Compression::RunLengthDecoding(&source[1], source_size - 1, dest, uncompressed_size); break; case 8: - DB::Compression::RLEDecoding(&source[1], source_size - 1, dest, uncompressed_size); + DB::Compression::RunLengthDecoding(&source[1], source_size - 1, dest, uncompressed_size); break; default: - throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress RLE-encoded data. Unsupported bytes size"); + throw Exception( + ErrorCodes::CANNOT_DECOMPRESS, + "Cannot decompress RunLength-encoded data. Unsupported bytes size"); } } diff --git a/dbms/src/IO/Compression/CompressionCodecRLE.h b/dbms/src/IO/Compression/CompressionCodecRunLength.h similarity index 91% rename from dbms/src/IO/Compression/CompressionCodecRLE.h rename to dbms/src/IO/Compression/CompressionCodecRunLength.h index b114ee13515..c3d38090346 100644 --- a/dbms/src/IO/Compression/CompressionCodecRLE.h +++ b/dbms/src/IO/Compression/CompressionCodecRunLength.h @@ -19,10 +19,10 @@ namespace DB { -class CompressionCodecRLE : public ICompressionCodec +class CompressionCodecRunLength : public ICompressionCodec { public: - explicit CompressionCodecRLE(UInt8 bytes_size_); + explicit CompressionCodecRunLength(UInt8 bytes_size_); UInt8 getMethodByte() const override; diff --git a/dbms/src/IO/Compression/CompressionFactory.h b/dbms/src/IO/Compression/CompressionFactory.h index 8e1646f5550..06a458a5144 100644 --- a/dbms/src/IO/Compression/CompressionFactory.h +++ b/dbms/src/IO/Compression/CompressionFactory.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -63,8 +63,8 @@ class CompressionFactory { case CompressionMethodByte::DeltaFOR: return std::make_unique(setting.type_bytes_size); - case CompressionMethodByte::RLE: - return std::make_unique(setting.type_bytes_size); + case CompressionMethodByte::RunLength: + return std::make_unique(setting.type_bytes_size); case CompressionMethodByte::FOR: return std::make_unique(setting.type_bytes_size); case CompressionMethodByte::NONE: diff --git a/dbms/src/IO/Compression/CompressionInfo.h b/dbms/src/IO/Compression/CompressionInfo.h index fc9866635c9..c8b59c974b7 100644 --- a/dbms/src/IO/Compression/CompressionInfo.h +++ b/dbms/src/IO/Compression/CompressionInfo.h @@ -58,7 +58,7 @@ enum class CompressionMethodByte : UInt8 ZSTD = 0x90, Multiple = 0x91, DeltaFOR = 0x92, - RLE = 0x93, + RunLength = 0x93, FOR = 0x94, Lightweight = 0x95, // COL_END is not a compreesion method, but a flag of column end used in compact file. diff --git a/dbms/src/IO/Compression/CompressionSettings.h b/dbms/src/IO/Compression/CompressionSettings.h index 2f90eee5019..54bf73714da 100644 --- a/dbms/src/IO/Compression/CompressionSettings.h +++ b/dbms/src/IO/Compression/CompressionSettings.h @@ -40,7 +40,7 @@ const std::unordered_map method_map = {CompressionMethodByte::QPL, CompressionMethod::QPL}, {CompressionMethodByte::NONE, CompressionMethod::NONE}, {CompressionMethodByte::DeltaFOR, CompressionMethod::NONE}, - {CompressionMethodByte::RLE, CompressionMethod::NONE}, + {CompressionMethodByte::RunLength, CompressionMethod::NONE}, {CompressionMethodByte::FOR, CompressionMethod::NONE}, {CompressionMethodByte::Lightweight, CompressionMethod::Lightweight}, }; diff --git a/dbms/src/IO/Compression/EncodingUtil.h b/dbms/src/IO/Compression/EncodingUtil.h index e07a8555304..da3ac50f56f 100644 --- a/dbms/src/IO/Compression/EncodingUtil.h +++ b/dbms/src/IO/Compression/EncodingUtil.h @@ -91,20 +91,20 @@ void ConstantDeltaDecoding(const char * src, UInt32 source_size, char * dest, UI /// Run-length encoding template -using RLEPair = std::pair; +using RunLengthPair = std::pair; template -using RLEPairs = std::vector>; +using RunLengthPairs = std::vector>; template -static constexpr size_t RLEPairLength = sizeof(T) + sizeof(UInt8); +static constexpr size_t RunLengthPairLength = sizeof(T) + sizeof(UInt8); template -size_t RLEPairsSize(const RLEPairs & rle) +size_t RunLengthPairsSize(const RunLengthPairs & rle) { - return rle.size() * RLEPairLength; + return rle.size() * RunLengthPairLength; } template -size_t RLEEncoding(const RLEPairs & rle, char * dest) +size_t RunLengthEncoding(const RunLengthPairs & rle, char * dest) { for (const auto & [value, count] : rle) { @@ -113,32 +113,39 @@ size_t RLEEncoding(const RLEPairs & rle, char * dest) unalignedStore(dest, count); dest += sizeof(UInt8); } - return rle.size() * RLEPairLength; + return rle.size() * RunLengthPairLength; } template -void RLEDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +void RunLengthDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { - if unlikely (source_size % RLEPairLength != 0) + if unlikely (source_size % RunLengthPairLength != 0) throw Exception( ErrorCodes::CANNOT_DECOMPRESS, - "Cannot use RLE decoding, data size {} is not aligned to {}", + "Cannot use RunLength decoding, data size {} is not aligned to {}", source_size, - RLEPairLength); + RunLengthPairLength); const char * dest_end = dest + dest_size; - for (UInt32 i = 0; i < source_size / RLEPairLength; ++i) + for (UInt32 i = 0; i < source_size / RunLengthPairLength; ++i) { T value = unalignedLoad(src); src += sizeof(T); auto count = unalignedLoad(src); src += sizeof(UInt8); - if (dest + count * sizeof(T) > dest_end) - throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot use RLE decoding, data is too large"); - for (UInt8 j = 0; j < count; ++j) + if (unlikely(dest + count * sizeof(T) > dest_end)) + throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot use RunLength decoding, data is too large"); + if constexpr (std::is_same_v || std::is_same_v) { - unalignedStore(dest, value); - dest += sizeof(T); + memset(dest, value, count); + } + else + { + for (UInt8 j = 0; j < count; ++j) + { + unalignedStore(dest, value); + dest += sizeof(T); + } } } } diff --git a/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp b/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp index f1381484c31..0c458dfdaeb 100644 --- a/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp +++ b/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp @@ -535,7 +535,7 @@ const auto IntegerCodecsToTest = ::testing::Values( CompressionMethodByte::Lightweight, CompressionMethodByte::DeltaFOR, // CompressionMethodByte::FOR, // disable FOR codec for now, since there are too many unit tests. - CompressionMethodByte::RLE + CompressionMethodByte::RunLength #if USE_QPL , CompressionMethodByte::QPL diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileWriter.h b/dbms/src/Storages/DeltaMerge/File/DMFileWriter.h index d85185bc729..f0ee9bf8606 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileWriter.h +++ b/dbms/src/Storages/DeltaMerge/File/DMFileWriter.h @@ -65,12 +65,25 @@ class DMFileWriter /*flags*/ -1, /*mode*/ 0666, max_compress_block_size)) - , compressed_buf(CompressedWriteBuffer<>::build( - *plain_file, - compression_settings, - !dmfile->getConfiguration().has_value())) , minmaxes(do_index ? std::make_shared(*type) : nullptr) { + // TODO: better, now only for test + if (type->isInteger()) + { + assert(compression_settings.settings.size() == 1); + CompressionSettings settings(CompressionMethod::Lightweight); + auto & setting = settings.settings[0]; + setting.type_bytes_size = type->getSizeOfValueInMemory(); + compressed_buf = CompressedWriteBuffer<>::build(*plain_file, settings, !dmfile->getConfiguration()); + } + else + { + compressed_buf = CompressedWriteBuffer<>::build( // + *plain_file, + compression_settings, + !dmfile->getConfiguration()); + } + if (!dmfile->useMetaV2()) { // will not used in DMFileFormat::V3, could be removed when v3 is default From e79352f233aec5bf85251a6e909aa5c75ab89191 Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Mon, 3 Jun 2024 15:05:29 +0800 Subject: [PATCH 07/27] rename Signed-off-by: Lloyd-Pottiger --- .../Compression/CompressionCodecDeltaFOR.cpp | 18 +++--- .../CompressionCodecIntegerLightweight.cpp | 16 ++--- .../Compression/CompressionCodecRunLength.cpp | 12 ++-- dbms/src/IO/Compression/EncodingUtil.cpp | 62 +++++++++---------- dbms/src/IO/Compression/EncodingUtil.h | 39 ++++++------ 5 files changed, 74 insertions(+), 73 deletions(-) diff --git a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp index 73880a3424f..64df56856e5 100644 --- a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp +++ b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp @@ -57,7 +57,7 @@ template UInt32 compressData(const char * source, UInt32 source_size, char * dest) { const auto count = source_size / sizeof(T); - DB::Compression::DeltaEncoding(reinterpret_cast(source), count, reinterpret_cast(dest)); + DB::Compression::deltaEncoding(reinterpret_cast(source), count, reinterpret_cast(dest)); // Cast deltas to signed type to better compress negative values. // For example, if we have a sequence of UInt8 values [3, 2, 1, 0], the deltas will be [3, -1, -1, -1] // If we compress them as UInt8, we will get [3, 255, 255, 255], which is not optimal. @@ -114,16 +114,16 @@ void CompressionCodecDeltaFOR::doDecompressData( switch (bytes_size) { case 1: - DB::Compression::DeltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::deltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; case 2: - DB::Compression::DeltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::deltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; case 4: - DB::Compression::DeltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::deltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; case 8: - DB::Compression::DeltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); + DB::Compression::deltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; default: throw Exception( @@ -158,16 +158,16 @@ void CompressionCodecDeltaFOR::ordinaryDecompress( switch (bytes_size) { case 1: - DB::Compression::OrdinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, dest_size); + DB::Compression::ordinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, dest_size); break; case 2: - DB::Compression::OrdinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, dest_size); + DB::Compression::ordinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, dest_size); break; case 4: - DB::Compression::OrdinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, dest_size); + DB::Compression::ordinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, dest_size); break; case 8: - DB::Compression::OrdinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, dest_size); + DB::Compression::ordinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, dest_size); break; default: throw Exception( diff --git a/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp b/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp index 08b87bad5b8..f2962fcbdbc 100644 --- a/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp +++ b/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp @@ -78,17 +78,17 @@ size_t CompressionCodecIntegerLightweight::compressDataForType(const char * sour { case Mode::CONSTANT: { - compressed_size += Compression::ConstantEncoding(std::get<0>(state), dest); + compressed_size += Compression::constantEncoding(std::get<0>(state), dest); break; } case Mode::CONSTANT_DELTA: { - compressed_size += Compression::ConstantDeltaEncoding(values[0], std::get<0>(state), dest); + compressed_size += Compression::constantDeltaEncoding(values[0], std::get<0>(state), dest); break; } case Mode::RunLength: { - compressed_size += Compression::RunLengthEncoding(std::get<1>(state), dest); + compressed_size += Compression::runLengthEncoding(std::get<1>(state), dest); break; } case Mode::FOR: @@ -146,19 +146,19 @@ void CompressionCodecIntegerLightweight::decompressDataForType( switch (mode) { case Mode::CONSTANT: - Compression::ConstantDecoding(source, source_size, dest, output_size); + Compression::constantDecoding(source, source_size, dest, output_size); break; case Mode::CONSTANT_DELTA: - Compression::ConstantDeltaDecoding(source, source_size, dest, output_size); + Compression::constantDeltaDecoding(source, source_size, dest, output_size); break; case Mode::RunLength: - Compression::RunLengthDecoding(source, source_size, dest, output_size); + Compression::runLengthDecoding(source, source_size, dest, output_size); break; case Mode::FOR: Compression::FORDecoding(source, source_size, dest, output_size); break; case Mode::DELTA_FOR: - Compression::DeltaFORDecoding(source, source_size, dest, output_size); + Compression::deltaFORDecoding(source, source_size, dest, output_size); break; case Mode::LZ4: if (unlikely(LZ4_decompress_safe(source, dest, source_size, output_size) < 0)) @@ -305,7 +305,7 @@ void CompressionCodecIntegerLightweight::CompressContext::analyze(std::span::max() : Compression::RunLengthPairsSize(rle); + size_t rle_size = rle.empty() ? std::numeric_limits::max() : Compression::runLengthPairsSize(rle); if (needAnalyzeRunLength() && rle_size < delta_for_size && rle_size < for_size && rle_size < estimate_lz_size) { state = std::move(rle); diff --git a/dbms/src/IO/Compression/CompressionCodecRunLength.cpp b/dbms/src/IO/Compression/CompressionCodecRunLength.cpp index 5ca5fb0c4b5..ed438ea7ce7 100644 --- a/dbms/src/IO/Compression/CompressionCodecRunLength.cpp +++ b/dbms/src/IO/Compression/CompressionCodecRunLength.cpp @@ -65,7 +65,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) ++rle_vec.back().second; } - if (DB::Compression::RunLengthPairsSize(rle_vec) > source_size) + if (DB::Compression::runLengthPairsSize(rle_vec) >= source_size) { dest[0] = JUST_COPY_CODE; memcpy(&dest[1], source, source_size); @@ -74,7 +74,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) dest[0] = sizeof(T); dest += 1; - return 1 + DB::Compression::RunLengthEncoding(rle_vec, dest); + return 1 + DB::Compression::runLengthEncoding(rle_vec, dest); } } // namespace @@ -134,16 +134,16 @@ void CompressionCodecRunLength::doDecompressData( switch (bytes_size) { case 1: - DB::Compression::RunLengthDecoding(&source[1], source_size - 1, dest, uncompressed_size); + DB::Compression::runLengthDecoding(&source[1], source_size - 1, dest, uncompressed_size); break; case 2: - DB::Compression::RunLengthDecoding(&source[1], source_size - 1, dest, uncompressed_size); + DB::Compression::runLengthDecoding(&source[1], source_size - 1, dest, uncompressed_size); break; case 4: - DB::Compression::RunLengthDecoding(&source[1], source_size - 1, dest, uncompressed_size); + DB::Compression::runLengthDecoding(&source[1], source_size - 1, dest, uncompressed_size); break; case 8: - DB::Compression::RunLengthDecoding(&source[1], source_size - 1, dest, uncompressed_size); + DB::Compression::runLengthDecoding(&source[1], source_size - 1, dest, uncompressed_size); break; default: throw Exception( diff --git a/dbms/src/IO/Compression/EncodingUtil.cpp b/dbms/src/IO/Compression/EncodingUtil.cpp index b947bf6dd84..531fbe60e24 100644 --- a/dbms/src/IO/Compression/EncodingUtil.cpp +++ b/dbms/src/IO/Compression/EncodingUtil.cpp @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "EncodingUtil.h" +#include #if defined(__AVX2__) #include @@ -22,7 +22,7 @@ namespace DB::Compression { template -void ApplyFrameOfReference(T * dst, T frame_of_reference, UInt32 count) +void applyFrameOfReference(T * dst, T frame_of_reference, UInt32 count) { if (frame_of_reference == 0) return; @@ -61,17 +61,17 @@ void ApplyFrameOfReference(T * dst, T frame_of_reference, UInt32 count) } } -template void ApplyFrameOfReference(UInt8 *, UInt8, UInt32); -template void ApplyFrameOfReference(UInt16 *, UInt16, UInt32); -template void ApplyFrameOfReference(UInt32 *, UInt32, UInt32); -template void ApplyFrameOfReference(UInt64 *, UInt64, UInt32); -template void ApplyFrameOfReference(Int8 *, Int8, UInt32); -template void ApplyFrameOfReference(Int16 *, Int16, UInt32); -template void ApplyFrameOfReference(Int32 *, Int32, UInt32); -template void ApplyFrameOfReference(Int64 *, Int64, UInt32); +template void applyFrameOfReference(UInt8 *, UInt8, UInt32); +template void applyFrameOfReference(UInt16 *, UInt16, UInt32); +template void applyFrameOfReference(UInt32 *, UInt32, UInt32); +template void applyFrameOfReference(UInt64 *, UInt64, UInt32); +template void applyFrameOfReference(Int8 *, Int8, UInt32); +template void applyFrameOfReference(Int16 *, Int16, UInt32); +template void applyFrameOfReference(Int32 *, Int32, UInt32); +template void applyFrameOfReference(Int64 *, Int64, UInt32); template -void SubtractFrameOfReference(T * dst, T frame_of_reference, UInt32 count) +void subtractFrameOfReference(T * dst, T frame_of_reference, UInt32 count) { if (frame_of_reference == 0) return; @@ -110,14 +110,14 @@ void SubtractFrameOfReference(T * dst, T frame_of_reference, UInt32 count) } } -template void SubtractFrameOfReference(Int8 *, Int8, UInt32); -template void SubtractFrameOfReference(Int16 *, Int16, UInt32); -template void SubtractFrameOfReference(Int32 *, Int32, UInt32); -template void SubtractFrameOfReference(Int64 *, Int64, UInt32); -template void SubtractFrameOfReference(UInt8 *, UInt8, UInt32); -template void SubtractFrameOfReference(UInt16 *, UInt16, UInt32); -template void SubtractFrameOfReference(UInt32 *, UInt32, UInt32); -template void SubtractFrameOfReference(UInt64 *, UInt64, UInt32); +template void subtractFrameOfReference(Int8 *, Int8, UInt32); +template void subtractFrameOfReference(Int16 *, Int16, UInt32); +template void subtractFrameOfReference(Int32 *, Int32, UInt32); +template void subtractFrameOfReference(Int64 *, Int64, UInt32); +template void subtractFrameOfReference(UInt8 *, UInt8, UInt32); +template void subtractFrameOfReference(UInt16 *, UInt16, UInt32); +template void subtractFrameOfReference(UInt32 *, UInt32, UInt32); +template void subtractFrameOfReference(UInt64 *, UInt64, UInt32); template UInt8 FOREncodingWidth(std::vector & values, T frame_of_reference) @@ -128,7 +128,7 @@ UInt8 FOREncodingWidth(std::vector & values, T frame_of_reference) // For example, we have a sequence of Int8 values [-128, 1, 127], after subtracting frame of reference -128, the values are [0, -127, -1]. // The minimum bit width required to store the values is 8 rather than the width of `max_value - min_value = -1`. // So we need to calculate the minimum bit width of the values after subtracting frame of reference. - SubtractFrameOfReference(values.data(), frame_of_reference, values.size()); + subtractFrameOfReference(values.data(), frame_of_reference, values.size()); T max_value = *std::max_element(values.cbegin(), values.cend()); T min_value = *std::min_element(values.cbegin(), values.cend()); return BitpackingPrimitives::minimumBitWidth(min_value, max_value); @@ -150,7 +150,7 @@ template UInt8 FOREncodingWidth(std::vector &, UInt32); template UInt8 FOREncodingWidth(std::vector &, UInt64); template -void DeltaDecoding(const char * source, UInt32 source_size, char * dest) +void deltaDecoding(const char * source, UInt32 source_size, char * dest) { ordinaryDeltaDecoding(source, source_size, dest); } @@ -159,7 +159,7 @@ void DeltaDecoding(const char * source, UInt32 source_size, char * dest) // Note: using SIMD to rewrite compress does not improve performance. template <> -void DeltaDecoding(const char * __restrict__ raw_source, UInt32 raw_source_size, char * __restrict__ raw_dest) +void deltaDecoding(const char * __restrict__ raw_source, UInt32 raw_source_size, char * __restrict__ raw_dest) { const auto * source = reinterpret_cast(raw_source); auto source_size = raw_source_size / sizeof(UInt32); @@ -183,7 +183,7 @@ void DeltaDecoding(const char * __restrict__ raw_source, UInt32 raw_sour } template <> -void DeltaDecoding(const char * __restrict__ raw_source, UInt32 raw_source_size, char * __restrict__ raw_dest) +void deltaDecoding(const char * __restrict__ raw_source, UInt32 raw_source_size, char * __restrict__ raw_dest) { const auto * source = reinterpret_cast(raw_source); auto source_size = raw_source_size / sizeof(UInt64); @@ -219,14 +219,14 @@ void DeltaDecoding(const char * __restrict__ raw_source, UInt32 raw_sour #endif template -void DeltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +void deltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { static_assert(std::is_integral::value, "Integral required."); - OrdinaryDeltaFORDecoding(src, source_size, dest, dest_size); + ordinaryDeltaFORDecoding(src, source_size, dest, dest_size); } template <> -void DeltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +void deltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { const auto count = dest_size / sizeof(UInt32); auto round_size = BitpackingPrimitives::roundUpToAlgorithmGroupSize(count); @@ -235,11 +235,11 @@ void DeltaFORDecoding(const char * src, UInt32 source_size, char * dest, char tmp_buffer[required_size]; memset(tmp_buffer, 0, required_size); FORDecoding(src, source_size, tmp_buffer, required_size); - DeltaDecoding(reinterpret_cast(tmp_buffer), dest_size, dest); + deltaDecoding(reinterpret_cast(tmp_buffer), dest_size, dest); } template <> -void DeltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +void deltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { const auto count = dest_size / sizeof(UInt64); const auto round_size = BitpackingPrimitives::roundUpToAlgorithmGroupSize(count); @@ -248,10 +248,10 @@ void DeltaFORDecoding(const char * src, UInt32 source_size, char * dest, char tmp_buffer[required_size]; memset(tmp_buffer, 0, required_size); FORDecoding(src, source_size, tmp_buffer, required_size); - DeltaDecoding(reinterpret_cast(tmp_buffer), dest_size, dest); + deltaDecoding(reinterpret_cast(tmp_buffer), dest_size, dest); } -template void DeltaFORDecoding(const char *, UInt32, char *, UInt32); -template void DeltaFORDecoding(const char *, UInt32, char *, UInt32); +template void deltaFORDecoding(const char *, UInt32, char *, UInt32); +template void deltaFORDecoding(const char *, UInt32, char *, UInt32); } // namespace DB::Compression diff --git a/dbms/src/IO/Compression/EncodingUtil.h b/dbms/src/IO/Compression/EncodingUtil.h index da3ac50f56f..2632e80b65a 100644 --- a/dbms/src/IO/Compression/EncodingUtil.h +++ b/dbms/src/IO/Compression/EncodingUtil.h @@ -35,16 +35,16 @@ namespace DB::Compression /// Constant encoding template -size_t ConstantEncoding(T constant, char * dest) +size_t constantEncoding(T constant, char * dest) { unalignedStore(dest, constant); return sizeof(T); } template -void ConstantDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +void constantDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { - if (source_size < sizeof(T)) + if (unlikely(source_size < sizeof(T))) throw Exception( ErrorCodes::CANNOT_DECOMPRESS, "Cannot use Constant decoding, data size {} is too small", @@ -61,7 +61,7 @@ void ConstantDecoding(const char * src, UInt32 source_size, char * dest, UInt32 /// Constant delta encoding template -size_t ConstantDeltaEncoding(T first_value, T constant_delta, char * dest) +size_t constantDeltaEncoding(T first_value, T constant_delta, char * dest) { unalignedStore(dest, first_value); dest += sizeof(T); @@ -70,9 +70,9 @@ size_t ConstantDeltaEncoding(T first_value, T constant_delta, char * dest) } template -void ConstantDeltaDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +void constantDeltaDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { - if (source_size < sizeof(T) + sizeof(T)) + if (unlikely(source_size < sizeof(T) + sizeof(T))) throw Exception( ErrorCodes::CANNOT_DECOMPRESS, "Cannot use ConstantDelta decoding, data size {} is too small", @@ -98,13 +98,13 @@ template static constexpr size_t RunLengthPairLength = sizeof(T) + sizeof(UInt8); template -size_t RunLengthPairsSize(const RunLengthPairs & rle) +size_t runLengthPairsSize(const RunLengthPairs & rle) { return rle.size() * RunLengthPairLength; } template -size_t RunLengthEncoding(const RunLengthPairs & rle, char * dest) +size_t runLengthEncoding(const RunLengthPairs & rle, char * dest) { for (const auto & [value, count] : rle) { @@ -117,9 +117,9 @@ size_t RunLengthEncoding(const RunLengthPairs & rle, char * dest) } template -void RunLengthDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +void runLengthDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { - if unlikely (source_size % RunLengthPairLength != 0) + if (unlikely(source_size % RunLengthPairLength != 0)) throw Exception( ErrorCodes::CANNOT_DECOMPRESS, "Cannot use RunLength decoding, data size {} is not aligned to {}", @@ -138,6 +138,7 @@ void RunLengthDecoding(const char * src, UInt32 source_size, char * dest, UInt32 if constexpr (std::is_same_v || std::is_same_v) { memset(dest, value, count); + dest += count * sizeof(T); } else { @@ -153,7 +154,7 @@ void RunLengthDecoding(const char * src, UInt32 source_size, char * dest, UInt32 /// Frame of Reference encoding template -void SubtractFrameOfReference(T * dst, T frame_of_reference, UInt32 count); +void subtractFrameOfReference(T * dst, T frame_of_reference, UInt32 count); template UInt8 FOREncodingWidth(std::vector & values, T frame_of_reference); @@ -163,7 +164,7 @@ size_t FOREncoding(std::vector & values, T frame_of_reference, UInt8 width, c { assert(!values.empty()); if constexpr (!skip_subtract_frame_of_reference) - SubtractFrameOfReference(values.data(), frame_of_reference, values.size()); + subtractFrameOfReference(values.data(), frame_of_reference, values.size()); // store frame of reference unalignedStore(dest, frame_of_reference); dest += sizeof(T); @@ -180,7 +181,7 @@ size_t FOREncoding(std::vector & values, T frame_of_reference, UInt8 width, c } template -void ApplyFrameOfReference(T * dst, T frame_of_reference, UInt32 count); +void applyFrameOfReference(T * dst, T frame_of_reference, UInt32 count); template void FORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) @@ -198,7 +199,7 @@ void FORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_ // Reserve enough space for the temporary buffer. unsigned char tmp_buffer[round_size * sizeof(T)]; BitpackingPrimitives::unPackBuffer(tmp_buffer, reinterpret_cast(src), count, width); - ApplyFrameOfReference(reinterpret_cast(tmp_buffer), frame_of_reference, count); + applyFrameOfReference(reinterpret_cast(tmp_buffer), frame_of_reference, count); memcpy(dest, tmp_buffer, dest_size); return; } @@ -207,13 +208,13 @@ void FORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_ reinterpret_cast(src), count, width); - ApplyFrameOfReference(reinterpret_cast(dest), frame_of_reference, count); + applyFrameOfReference(reinterpret_cast(dest), frame_of_reference, count); } /// Delta encoding template -void DeltaEncoding(const T * source, UInt32 count, T * dest) +void deltaEncoding(const T * source, UInt32 count, T * dest) { T prev = 0; for (UInt32 i = 0; i < count; ++i) @@ -240,12 +241,12 @@ void ordinaryDeltaDecoding(const char * source, UInt32 source_size, char * dest) } template -void DeltaDecoding(const char * source, UInt32 source_size, char * dest); +void deltaDecoding(const char * source, UInt32 source_size, char * dest); /// Delta + Frame of Reference encoding template -void OrdinaryDeltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) +void ordinaryDeltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { using TS = typename std::make_signed_t; FORDecoding(src, source_size, dest, dest_size); @@ -253,6 +254,6 @@ void OrdinaryDeltaFORDecoding(const char * src, UInt32 source_size, char * dest, } template -void DeltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size); +void deltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size); } // namespace DB::Compression From d621355fa83e6958700566955e1a6afd810fa6e3 Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger <60744015+Lloyd-Pottiger@users.noreply.github.com> Date: Tue, 4 Jun 2024 16:45:50 +0800 Subject: [PATCH 08/27] Update dbms/src/IO/Compression/EncodingUtil.cpp Co-authored-by: jinhelin --- dbms/src/IO/Compression/EncodingUtil.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dbms/src/IO/Compression/EncodingUtil.cpp b/dbms/src/IO/Compression/EncodingUtil.cpp index 531fbe60e24..aca78ee1784 100644 --- a/dbms/src/IO/Compression/EncodingUtil.cpp +++ b/dbms/src/IO/Compression/EncodingUtil.cpp @@ -129,9 +129,8 @@ UInt8 FOREncodingWidth(std::vector & values, T frame_of_reference) // The minimum bit width required to store the values is 8 rather than the width of `max_value - min_value = -1`. // So we need to calculate the minimum bit width of the values after subtracting frame of reference. subtractFrameOfReference(values.data(), frame_of_reference, values.size()); - T max_value = *std::max_element(values.cbegin(), values.cend()); - T min_value = *std::min_element(values.cbegin(), values.cend()); - return BitpackingPrimitives::minimumBitWidth(min_value, max_value); + auto [min_value, max_value] = std::minmax_element(values.cbegin(), values.cend()); + return BitpackingPrimitives::minimumBitWidth(*min_value, *max_value); } else { From 91ac0d675fc2a54e6c3d917f15199ee99e5a8fd9 Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger <60744015+Lloyd-Pottiger@users.noreply.github.com> Date: Fri, 7 Jun 2024 17:17:31 +0800 Subject: [PATCH 09/27] Update dbms/src/IO/Compression/CompressionCodecFOR.cpp Co-authored-by: jinhelin --- dbms/src/IO/Compression/CompressionCodecFOR.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dbms/src/IO/Compression/CompressionCodecFOR.cpp b/dbms/src/IO/Compression/CompressionCodecFOR.cpp index db3b7511bb0..55f0fadc803 100644 --- a/dbms/src/IO/Compression/CompressionCodecFOR.cpp +++ b/dbms/src/IO/Compression/CompressionCodecFOR.cpp @@ -53,8 +53,7 @@ template UInt32 CompressionCodecFOR::compressData(const T * source, UInt32 count, char * dest) { assert(count > 0); // doCompressData ensure it - std::vector values(count); - values.assign(source, source + count); + std::vector values(source, source + count); T frame_of_reference = *std::min_element(values.cbegin(), values.cend()); UInt8 width = DB::Compression::FOREncodingWidth(values, frame_of_reference); return DB::Compression::FOREncoding>(values, frame_of_reference, width, dest); From 61fe4bf98293ccf0e86958404c6bcc6b77c79f7b Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Fri, 21 Jun 2024 15:56:42 +0800 Subject: [PATCH 10/27] work with non-integer type Signed-off-by: Lloyd-Pottiger --- .../Compression/CompressionCodecDeltaFOR.cpp | 128 +++--- .../IO/Compression/CompressionCodecDeltaFOR.h | 6 +- .../IO/Compression/CompressionCodecFOR.cpp | 110 +++--- dbms/src/IO/Compression/CompressionCodecFOR.h | 6 +- .../CompressionCodecLightweight.cpp | 126 ++++++ ...weight.h => CompressionCodecLightweight.h} | 40 +- ... CompressionCodecLightweight_Interger.cpp} | 366 ++++++++---------- ...CompressionCodecLightweight_NonInteger.cpp | 53 +++ .../Compression/CompressionCodecRunLength.cpp | 105 +++-- .../Compression/CompressionCodecRunLength.h | 8 +- dbms/src/IO/Compression/CompressionFactory.h | 4 +- dbms/src/IO/Compression/CompressionInfo.h | 11 + dbms/src/IO/Compression/CompressionSettings.h | 2 +- dbms/src/IO/Compression/EncodingUtil.h | 7 + .../tests/gtest_codec_compression.cpp | 4 +- .../Storages/DeltaMerge/File/DMFileWriter.h | 21 +- 16 files changed, 585 insertions(+), 412 deletions(-) create mode 100644 dbms/src/IO/Compression/CompressionCodecLightweight.cpp rename dbms/src/IO/Compression/{CompressionCodecIntegerLightweight.h => CompressionCodecLightweight.h} (68%) rename dbms/src/IO/Compression/{CompressionCodecIntegerLightweight.cpp => CompressionCodecLightweight_Interger.cpp} (63%) create mode 100644 dbms/src/IO/Compression/CompressionCodecLightweight_NonInteger.cpp diff --git a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp index 64df56856e5..099dfd900ba 100644 --- a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp +++ b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp @@ -17,10 +17,13 @@ #include #include #include +#include #include #include #include +#include +#include namespace DB { @@ -31,8 +34,8 @@ extern const int CANNOT_COMPRESS; extern const int CANNOT_DECOMPRESS; } // namespace ErrorCodes -CompressionCodecDeltaFOR::CompressionCodecDeltaFOR(UInt8 bytes_size_) - : bytes_size(bytes_size_) +CompressionCodecDeltaFOR::CompressionCodecDeltaFOR(CompressionDataType data_type_) + : data_type(data_type_) {} UInt8 CompressionCodecDeltaFOR::getMethodByte() const @@ -42,12 +45,22 @@ UInt8 CompressionCodecDeltaFOR::getMethodByte() const UInt32 CompressionCodecDeltaFOR::getMaxCompressedDataSize(UInt32 uncompressed_size) const { - /** - *|bytes_of_original_type|frame_of_reference|width(bits) |bitpacked data| - *|1 bytes |bytes_size |sizeof(UInt8)|required size | - */ - const size_t count = uncompressed_size / bytes_size; - return 1 + bytes_size + sizeof(UInt8) + BitpackingPrimitives::getRequiredSize(count, bytes_size * 8); + switch (data_type) + { + case CompressionDataType::Int8: + case CompressionDataType::Int16: + case CompressionDataType::Int32: + case CompressionDataType::Int64: + { + // |bytes_of_original_type|frame_of_reference|width(bits) |bitpacked data| + // |1 bytes |bytes_size |sizeof(UInt8)|required size | + auto bytes_size = magic_enum::enum_integer(data_type); + const size_t count = uncompressed_size / bytes_size; + return 1 + bytes_size + sizeof(UInt8) + BitpackingPrimitives::getRequiredSize(count, bytes_size * 8); + } + default: + return 1 + LZ4_COMPRESSBOUND(uncompressed_size); + } } namespace @@ -56,35 +69,44 @@ namespace template UInt32 compressData(const char * source, UInt32 source_size, char * dest) { + constexpr auto bytes_size = sizeof(T); + if unlikely (source_size % bytes_size != 0) + throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "source size {} is not aligned to {}", source_size, bytes_size); const auto count = source_size / sizeof(T); DB::Compression::deltaEncoding(reinterpret_cast(source), count, reinterpret_cast(dest)); // Cast deltas to signed type to better compress negative values. // For example, if we have a sequence of UInt8 values [3, 2, 1, 0], the deltas will be [3, -1, -1, -1] // If we compress them as UInt8, we will get [3, 255, 255, 255], which is not optimal. using TS = typename std::make_signed::type; - return DB::CompressionCodecFOR::compressData(reinterpret_cast(dest), count, dest); + return DB::CompressionCodecFOR::compressData(reinterpret_cast(dest), source_size, dest); } } // namespace UInt32 CompressionCodecDeltaFOR::doCompressData(const char * source, UInt32 source_size, char * dest) const { - if unlikely (source_size % bytes_size != 0) - throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "source size {} is not aligned to {}", source_size, bytes_size); - dest[0] = bytes_size; - size_t start_pos = 1; - switch (bytes_size) + dest[0] = magic_enum::enum_integer(data_type); + dest += 1; + switch (data_type) { - case 1: - return 1 + compressData(source, source_size, &dest[start_pos]); - case 2: - return 1 + compressData(source, source_size, &dest[start_pos]); - case 4: - return 1 + compressData(source, source_size, &dest[start_pos]); - case 8: - return 1 + compressData(source, source_size, &dest[start_pos]); + case CompressionDataType::Int8: + return 1 + compressData(source, source_size, dest); + case CompressionDataType::Int16: + return 1 + compressData(source, source_size, dest); + case CompressionDataType::Int32: + return 1 + compressData(source, source_size, dest); + case CompressionDataType::Int64: + return 1 + compressData(source, source_size, dest); default: - throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress DeltaFor-encoded data. Unsupported bytes size"); + auto success = LZ4_compress_fast( + source, + dest, + source_size, + LZ4_COMPRESSBOUND(source_size), + CompressionSetting::getDefaultLevel(CompressionMethod::LZ4)); + if (!success) + throw Exception("Cannot LZ4_compress_fast", ErrorCodes::CANNOT_COMPRESS); + return 1 + success; } } @@ -103,32 +125,28 @@ void CompressionCodecDeltaFOR::doDecompressData( return; UInt8 bytes_size = source[0]; - if unlikely (uncompressed_size % bytes_size != 0) - throw Exception( - ErrorCodes::CANNOT_DECOMPRESS, - "uncompressed size {} is not aligned to {}", - uncompressed_size, - bytes_size); + auto data_type = magic_enum::enum_cast(bytes_size); + RUNTIME_CHECK(data_type.has_value()); UInt32 source_size_no_header = source_size - 1; - switch (bytes_size) + switch (data_type.value()) { - case 1: + case CompressionDataType::Int8: DB::Compression::deltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; - case 2: + case CompressionDataType::Int16: DB::Compression::deltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; - case 4: + case CompressionDataType::Int32: DB::Compression::deltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; - case 8: + case CompressionDataType::Int64: DB::Compression::deltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; default: - throw Exception( - ErrorCodes::CANNOT_DECOMPRESS, - "Cannot decompress DeltaFor-encoded data. Unsupported bytes size"); + if (unlikely(LZ4_decompress_safe(&source[1], dest, source_size_no_header, uncompressed_size) < 0)) + throw Exception("Cannot LZ4_decompress_safe", ErrorCodes::CANNOT_DECOMPRESS); + break; } } @@ -136,43 +154,39 @@ void CompressionCodecDeltaFOR::ordinaryDecompress( const char * source, UInt32 source_size, char * dest, - UInt32 dest_size) + UInt32 uncompressed_size) { if unlikely (source_size < 2) throw Exception( ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress DeltaFor-encoded data. File has wrong header"); - if (dest_size == 0) + if (uncompressed_size == 0) return; UInt8 bytes_size = source[0]; - if unlikely (dest_size % bytes_size != 0) - throw Exception( - ErrorCodes::CANNOT_DECOMPRESS, - "uncompressed size {} is not aligned to {}", - dest_size, - bytes_size); + auto data_type = magic_enum::enum_cast(bytes_size); + RUNTIME_CHECK(data_type.has_value()); UInt32 source_size_no_header = source_size - 1; - switch (bytes_size) + switch (data_type.value()) { - case 1: - DB::Compression::ordinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, dest_size); + case CompressionDataType::Int8: + DB::Compression::ordinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; - case 2: - DB::Compression::ordinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, dest_size); + case CompressionDataType::Int16: + DB::Compression::ordinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; - case 4: - DB::Compression::ordinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, dest_size); + case CompressionDataType::Int32: + DB::Compression::ordinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; - case 8: - DB::Compression::ordinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, dest_size); + case CompressionDataType::Int64: + DB::Compression::ordinaryDeltaFORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; default: - throw Exception( - ErrorCodes::CANNOT_DECOMPRESS, - "Cannot decompress DeltaFor-encoded data. Unsupported bytes size"); + if (unlikely(LZ4_decompress_safe(&source[1], dest, source_size_no_header, uncompressed_size) < 0)) + throw Exception("Cannot LZ4_decompress_safe", ErrorCodes::CANNOT_DECOMPRESS); + break; } } diff --git a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.h b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.h index 316f4be72a9..5faf713e864 100644 --- a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.h +++ b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.h @@ -22,11 +22,11 @@ namespace DB class CompressionCodecDeltaFOR : public ICompressionCodec { public: - explicit CompressionCodecDeltaFOR(UInt8 bytes_size_); + explicit CompressionCodecDeltaFOR(CompressionDataType data_type_); UInt8 getMethodByte() const override; - static void ordinaryDecompress(const char * source, UInt32 source_size, char * dest, UInt32 dest_size); + static void ordinaryDecompress(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size); #ifndef DBMS_PUBLIC_GTEST protected: @@ -42,7 +42,7 @@ class CompressionCodecDeltaFOR : public ICompressionCodec bool isGenericCompression() const override { return false; } private: - const UInt8 bytes_size; + const CompressionDataType data_type; }; } // namespace DB diff --git a/dbms/src/IO/Compression/CompressionCodecFOR.cpp b/dbms/src/IO/Compression/CompressionCodecFOR.cpp index 55f0fadc803..881232b8155 100644 --- a/dbms/src/IO/Compression/CompressionCodecFOR.cpp +++ b/dbms/src/IO/Compression/CompressionCodecFOR.cpp @@ -16,9 +16,13 @@ #include #include #include +#include #include #include #include +#include + +#include namespace DB @@ -30,8 +34,8 @@ extern const int CANNOT_COMPRESS; extern const int CANNOT_DECOMPRESS; } // namespace ErrorCodes -CompressionCodecFOR::CompressionCodecFOR(UInt8 bytes_size_) - : bytes_size(bytes_size_) +CompressionCodecFOR::CompressionCodecFOR(CompressionDataType data_type_) + : data_type(data_type_) {} UInt8 CompressionCodecFOR::getMethodByte() const @@ -41,18 +45,33 @@ UInt8 CompressionCodecFOR::getMethodByte() const UInt32 CompressionCodecFOR::getMaxCompressedDataSize(UInt32 uncompressed_size) const { - /** - *|bytes_of_original_type|frame_of_reference|width(bits) |bitpacked data| - *|1 bytes |bytes_size |sizeof(UInt8)|required size | - */ - const size_t count = uncompressed_size / bytes_size; - return 1 + bytes_size + sizeof(UInt8) + BitpackingPrimitives::getRequiredSize(count, bytes_size * 8); + switch (data_type) + { + case CompressionDataType::Int8: + case CompressionDataType::Int16: + case CompressionDataType::Int32: + case CompressionDataType::Int64: + { + // |bytes_of_original_type|frame_of_reference|width(bits) |bitpacked data| + // |1 bytes |bytes_size |sizeof(UInt8)|required size | + auto bytes_size = magic_enum::enum_integer(data_type); + const size_t count = uncompressed_size / bytes_size; + return 1 + bytes_size + sizeof(UInt8) + BitpackingPrimitives::getRequiredSize(count, bytes_size * 8); + } + default: + return 1 + LZ4_COMPRESSBOUND(uncompressed_size); + } } template -UInt32 CompressionCodecFOR::compressData(const T * source, UInt32 count, char * dest) +UInt32 CompressionCodecFOR::compressData(const T * source, UInt32 source_size, char * dest) { - assert(count > 0); // doCompressData ensure it + constexpr size_t bytes_size = sizeof(T); + if unlikely (source_size % bytes_size != 0) + throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "source size {} is not aligned to {}", source_size, bytes_size); + auto count = source_size / bytes_size; + if unlikely (count == 0) + throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress empty data"); std::vector values(source, source + count); T frame_of_reference = *std::min_element(values.cbegin(), values.cend()); UInt8 width = DB::Compression::FOREncodingWidth(values, frame_of_reference); @@ -61,25 +80,28 @@ UInt32 CompressionCodecFOR::compressData(const T * source, UInt32 count, char * UInt32 CompressionCodecFOR::doCompressData(const char * source, UInt32 source_size, char * dest) const { - if unlikely (source_size % bytes_size != 0) - throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "source size {} is not aligned to {}", source_size, bytes_size); - dest[0] = bytes_size; - auto count = source_size / bytes_size; - switch (bytes_size) + dest[0] = magic_enum::enum_integer(data_type); + dest += 1; + switch (data_type) { - case 1: - return 1 + compressData(reinterpret_cast(source), count, &dest[1]); - case 2: - return 1 + compressData(reinterpret_cast(source), count, &dest[1]); - case 4: - return 1 + compressData(reinterpret_cast(source), count, &dest[1]); - case 8: - return 1 + compressData(reinterpret_cast(source), count, &dest[1]); + case CompressionDataType::Int8: + return 1 + compressData(reinterpret_cast(source), source_size, dest); + case CompressionDataType::Int16: + return 1 + compressData(reinterpret_cast(source), source_size, dest); + case CompressionDataType::Int32: + return 1 + compressData(reinterpret_cast(source), source_size, dest); + case CompressionDataType::Int64: + return 1 + compressData(reinterpret_cast(source), source_size, dest); default: - throw Exception( - ErrorCodes::CANNOT_COMPRESS, - "Cannot compress For-encoded data. Unsupported bytes size: {}", - bytes_size); + auto success = LZ4_compress_fast( + source, + dest, + source_size, + LZ4_COMPRESSBOUND(source_size), + CompressionSetting::getDefaultLevel(CompressionMethod::LZ4)); + if (!success) + throw Exception("Cannot LZ4_compress_fast", ErrorCodes::CANNOT_COMPRESS); + return 1 + success; } } @@ -96,42 +118,36 @@ void CompressionCodecFOR::doDecompressData( return; UInt8 bytes_size = source[0]; - if unlikely (uncompressed_size % bytes_size != 0) - throw Exception( - ErrorCodes::CANNOT_DECOMPRESS, - "uncompressed size {} is not aligned to {}", - uncompressed_size, - bytes_size); + auto data_type = magic_enum::enum_cast(bytes_size); + RUNTIME_CHECK(data_type.has_value()); UInt32 source_size_no_header = source_size - 1; - switch (bytes_size) + switch (data_type.value()) { - case 1: + case CompressionDataType::Int8: DB::Compression::FORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; - case 2: + case CompressionDataType::Int16: DB::Compression::FORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; - case 4: + case CompressionDataType::Int32: DB::Compression::FORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; - case 8: + case CompressionDataType::Int64: DB::Compression::FORDecoding(&source[1], source_size_no_header, dest, uncompressed_size); break; default: - throw Exception( - ErrorCodes::CANNOT_DECOMPRESS, - "Cannot decompress For-encoded data. Unsupported bytes size: {}", - bytes_size); + if (unlikely(LZ4_decompress_safe(&source[1], dest, source_size_no_header, uncompressed_size) < 0)) + throw Exception("Cannot LZ4_decompress_safe", ErrorCodes::CANNOT_DECOMPRESS); + break; } } - // The following instantiations are used in CompressionCodecDeltaFor.cpp -template UInt32 CompressionCodecFOR::compressData(const Int8 * source, UInt32 count, char * dest); -template UInt32 CompressionCodecFOR::compressData(const Int16 * source, UInt32 count, char * dest); -template UInt32 CompressionCodecFOR::compressData(const Int32 * source, UInt32 count, char * dest); -template UInt32 CompressionCodecFOR::compressData(const Int64 * source, UInt32 count, char * dest); +template UInt32 CompressionCodecFOR::compressData(const Int8 * source, UInt32 source_size, char * dest); +template UInt32 CompressionCodecFOR::compressData(const Int16 * source, UInt32 source_size, char * dest); +template UInt32 CompressionCodecFOR::compressData(const Int32 * source, UInt32 source_size, char * dest); +template UInt32 CompressionCodecFOR::compressData(const Int64 * source, UInt32 source_size, char * dest); } // namespace DB diff --git a/dbms/src/IO/Compression/CompressionCodecFOR.h b/dbms/src/IO/Compression/CompressionCodecFOR.h index 75dd8b91734..824c36276cf 100644 --- a/dbms/src/IO/Compression/CompressionCodecFOR.h +++ b/dbms/src/IO/Compression/CompressionCodecFOR.h @@ -30,12 +30,12 @@ namespace DB class CompressionCodecFOR : public ICompressionCodec { public: - explicit CompressionCodecFOR(UInt8 bytes_size_); + explicit CompressionCodecFOR(CompressionDataType data_type_); UInt8 getMethodByte() const override; template - static UInt32 compressData(const T * source, UInt32 count, char * dest); + static UInt32 compressData(const T * source, UInt32 source_size, char * dest); #ifndef DBMS_PUBLIC_GTEST protected: @@ -51,7 +51,7 @@ class CompressionCodecFOR : public ICompressionCodec bool isGenericCompression() const override { return false; } private: - const UInt8 bytes_size; + const CompressionDataType data_type; }; } // namespace DB diff --git a/dbms/src/IO/Compression/CompressionCodecLightweight.cpp b/dbms/src/IO/Compression/CompressionCodecLightweight.cpp new file mode 100644 index 00000000000..efa0b77309d --- /dev/null +++ b/dbms/src/IO/Compression/CompressionCodecLightweight.cpp @@ -0,0 +1,126 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include + + +namespace DB +{ + +// TODO: metrics + +namespace ErrorCodes +{ +extern const int CANNOT_COMPRESS; +extern const int CANNOT_DECOMPRESS; +} // namespace ErrorCodes + +CompressionCodecLightweight::CompressionCodecLightweight(CompressionDataType data_type_) + : data_type(data_type_) +{} + +UInt8 CompressionCodecLightweight::getMethodByte() const +{ + return static_cast(CompressionMethodByte::Lightweight); +} + +UInt32 CompressionCodecLightweight::getMaxCompressedDataSize(UInt32 uncompressed_size) const +{ + // 1 byte for bytes_size, 1 byte for mode, and the rest for compressed data + return 1 + 1 + LZ4_COMPRESSBOUND(uncompressed_size); +} + +CompressionCodecLightweight::~CompressionCodecLightweight() +{ + if (ctx.isCompression()) + LOG_INFO(Logger::get(), "lightweight codec: {}", ctx.toDebugString()); +} + +UInt32 CompressionCodecLightweight::doCompressData(const char * source, UInt32 source_size, char * dest) const +{ + dest[0] = magic_enum::enum_integer(data_type); + dest += 1; + switch (data_type) + { + case CompressionDataType::Int8: + return 1 + compressDataForInteger(source, source_size, dest); + case CompressionDataType::Int16: + return 1 + compressDataForInteger(source, source_size, dest); + case CompressionDataType::Int32: + return 1 + compressDataForInteger(source, source_size, dest); + case CompressionDataType::Int64: + return 1 + compressDataForInteger(source, source_size, dest); + case CompressionDataType::Float32: + case CompressionDataType::Float64: + case CompressionDataType::String: + return 1 + compressDataForNonInteger(source, source_size, dest); + default: + throw Exception( + ErrorCodes::CANNOT_COMPRESS, + "Cannot compress lightweight codec data. Invalid data type {}", + magic_enum::enum_name(data_type)); + } +} + +void CompressionCodecLightweight::doDecompressData( + const char * source, + UInt32 source_size, + char * dest, + UInt32 uncompressed_size) const +{ + if unlikely (source_size < 2) + throw Exception( + ErrorCodes::CANNOT_DECOMPRESS, + "Cannot decompress lightweight codec data. File has wrong header"); + + if (uncompressed_size == 0) + return; + + UInt8 bytes_size = source[0]; + auto data_type = magic_enum::enum_cast(bytes_size); + if unlikely (!data_type.has_value()) + throw Exception( + ErrorCodes::CANNOT_DECOMPRESS, + "Cannot decompress lightweight codec data. File has wrong header, unknown data type {}", + bytes_size); + + UInt32 source_size_no_header = source_size - 1; + switch (data_type.value()) + { + case CompressionDataType::Int8: + decompressDataForInteger(&source[1], source_size_no_header, dest, uncompressed_size); + break; + case CompressionDataType::Int16: + decompressDataForInteger(&source[1], source_size_no_header, dest, uncompressed_size); + break; + case CompressionDataType::Int32: + decompressDataForInteger(&source[1], source_size_no_header, dest, uncompressed_size); + break; + case CompressionDataType::Int64: + decompressDataForInteger(&source[1], source_size_no_header, dest, uncompressed_size); + break; + case CompressionDataType::Float32: + case CompressionDataType::Float64: + case CompressionDataType::String: + decompressDataForNonInteger(&source[1], source_size_no_header, dest, uncompressed_size); + break; + } +} + +} // namespace DB diff --git a/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.h b/dbms/src/IO/Compression/CompressionCodecLightweight.h similarity index 68% rename from dbms/src/IO/Compression/CompressionCodecIntegerLightweight.h rename to dbms/src/IO/Compression/CompressionCodecLightweight.h index 76b7db18599..927ab466bce 100644 --- a/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.h +++ b/dbms/src/IO/Compression/CompressionCodecLightweight.h @@ -22,14 +22,20 @@ namespace DB { -class CompressionCodecIntegerLightweight : public ICompressionCodec +/** + * @brief Lightweight compression codec + * For integer data, it supports constant, constant delta, run-length, frame of reference, delta frame of reference, and LZ4. + * For non-integer data, it supports LZ4. + * The codec selects the best mode for each block of data. + */ +class CompressionCodecLightweight : public ICompressionCodec { public: - explicit CompressionCodecIntegerLightweight(UInt8 bytes_size_); + explicit CompressionCodecLightweight(CompressionDataType data_type_); UInt8 getMethodByte() const override; - ~CompressionCodecIntegerLightweight() override; + ~CompressionCodecLightweight() override; protected: UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override; @@ -42,7 +48,9 @@ class CompressionCodecIntegerLightweight : public ICompressionCodec bool isGenericCompression() const override { return false; } private: - enum class Mode : UInt8 + /// Integer data + + enum class IntegerMode : UInt8 { Invalid = 0, CONSTANT = 1, // all values are the same @@ -79,26 +87,26 @@ class CompressionCodecIntegerLightweight : public ICompressionCodec // State is a union of different states for different modes template - using State = std::variant, RunLengthState, FORState, DeltaFORState>; + using IntegerState = std::variant, RunLengthState, FORState, DeltaFORState>; - class CompressContext + class IntegerCompressContext { public: - CompressContext() = default; + IntegerCompressContext() = default; bool needAnalyze() const; bool needAnalyzeDelta() const; bool needAnalyzeRunLength() const; template - void analyze(std::span & values, State & state); + void analyze(std::span & values, IntegerState & state); void update(size_t uncompressed_size, size_t compressed_size); String toDebugString() const; bool isCompression() const { return lz4_counter > 0 || lw_counter > 0; } - Mode mode = Mode::LZ4; + IntegerMode mode = IntegerMode::LZ4; private: size_t lw_uncompressed_size = 0; @@ -113,13 +121,19 @@ class CompressionCodecIntegerLightweight : public ICompressionCodec }; template - size_t compressDataForType(const char * source, UInt32 source_size, char * dest) const; + size_t compressDataForInteger(const char * source, UInt32 source_size, char * dest) const; template - void decompressDataForType(const char * source, UInt32 source_size, char * dest, UInt32 output_size) const; + void decompressDataForInteger(const char * source, UInt32 source_size, char * dest, UInt32 output_size) const; + + /// Non-integer data - mutable CompressContext ctx; - const UInt8 bytes_size; + static size_t compressDataForNonInteger(const char * source, UInt32 source_size, char * dest); + static void decompressDataForNonInteger(const char * source, UInt32 source_size, char * dest, UInt32 output_size); + +private: + mutable IntegerCompressContext ctx; + const CompressionDataType data_type; }; } // namespace DB diff --git a/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp b/dbms/src/IO/Compression/CompressionCodecLightweight_Interger.cpp similarity index 63% rename from dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp rename to dbms/src/IO/Compression/CompressionCodecLightweight_Interger.cpp index f2962fcbdbc..e57050f9bcb 100644 --- a/dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp +++ b/dbms/src/IO/Compression/CompressionCodecLightweight_Interger.cpp @@ -12,167 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include -#include -#include +#include #include #include -#include -#include +#include #include -#include -#include - - namespace DB { -// TODO: metrics - namespace ErrorCodes { extern const int CANNOT_COMPRESS; extern const int CANNOT_DECOMPRESS; } // namespace ErrorCodes -CompressionCodecIntegerLightweight::CompressionCodecIntegerLightweight(UInt8 bytes_size_) - : bytes_size(bytes_size_) -{} - -UInt8 CompressionCodecIntegerLightweight::getMethodByte() const -{ - return static_cast(CompressionMethodByte::Lightweight); -} - -UInt32 CompressionCodecIntegerLightweight::getMaxCompressedDataSize(UInt32 uncompressed_size) const -{ - // 1 byte for bytes_size, 1 byte for mode, and the rest for compressed data - return 1 + 1 + LZ4_COMPRESSBOUND(uncompressed_size); -} - -CompressionCodecIntegerLightweight::~CompressionCodecIntegerLightweight() -{ - if (ctx.isCompression()) - LOG_INFO(Logger::get(), "lightweight codec: {}", ctx.toDebugString()); -} - -template -size_t CompressionCodecIntegerLightweight::compressDataForType(const char * source, UInt32 source_size, char * dest) - const -{ - // Load values - const size_t count = source_size / sizeof(T); - std::span values(reinterpret_cast(source), count); - - // Analyze - State state; - ctx.analyze(values, state); - - // Compress - unalignedStore(dest, static_cast(ctx.mode)); - dest += sizeof(UInt8); - size_t compressed_size = 1; - switch (ctx.mode) - { - case Mode::CONSTANT: - { - compressed_size += Compression::constantEncoding(std::get<0>(state), dest); - break; - } - case Mode::CONSTANT_DELTA: - { - compressed_size += Compression::constantDeltaEncoding(values[0], std::get<0>(state), dest); - break; - } - case Mode::RunLength: - { - compressed_size += Compression::runLengthEncoding(std::get<1>(state), dest); - break; - } - case Mode::FOR: - { - FORState for_state = std::get<2>(state); - compressed_size += Compression::FOREncoding(for_state.values, for_state.min_value, for_state.bit_width, dest); - break; - } - case Mode::DELTA_FOR: - { - DeltaFORState delta_for_state = std::get<3>(state); - compressed_size += Compression::FOREncoding, true>( - delta_for_state.deltas, - delta_for_state.min_delta_value, - delta_for_state.bit_width, - dest); - break; - } - case Mode::LZ4: - { - auto success = LZ4_compress_fast( - source, - dest, - source_size, - LZ4_COMPRESSBOUND(source_size), - CompressionSetting::getDefaultLevel(CompressionMethod::LZ4)); - if (!success) - throw Exception("Cannot LZ4_compress_fast", ErrorCodes::CANNOT_COMPRESS); - compressed_size += success; - break; - } - default: - throw Exception( - ErrorCodes::CANNOT_COMPRESS, - "Cannot compress with lightweight codec, unknown mode {}", - static_cast(ctx.mode)); - } - - // Update statistics - ctx.update(source_size, compressed_size); - - return compressed_size; -} - -template -void CompressionCodecIntegerLightweight::decompressDataForType( - const char * source, - UInt32 source_size, - char * dest, - UInt32 output_size) const -{ - auto mode = static_cast(unalignedLoad(source)); - source += sizeof(UInt8); - source_size -= sizeof(UInt8); - switch (mode) - { - case Mode::CONSTANT: - Compression::constantDecoding(source, source_size, dest, output_size); - break; - case Mode::CONSTANT_DELTA: - Compression::constantDeltaDecoding(source, source_size, dest, output_size); - break; - case Mode::RunLength: - Compression::runLengthDecoding(source, source_size, dest, output_size); - break; - case Mode::FOR: - Compression::FORDecoding(source, source_size, dest, output_size); - break; - case Mode::DELTA_FOR: - Compression::deltaFORDecoding(source, source_size, dest, output_size); - break; - case Mode::LZ4: - if (unlikely(LZ4_decompress_safe(source, dest, source_size, output_size) < 0)) - throw Exception("Cannot LZ4_decompress_safe", ErrorCodes::CANNOT_DECOMPRESS); - break; - default: - throw Exception( - ErrorCodes::CANNOT_DECOMPRESS, - "Cannot decompress with lightweight codec, unknown mode {}", - static_cast(mode)); - } -} - -String CompressionCodecIntegerLightweight::CompressContext::toDebugString() const +String CompressionCodecLightweight::IntegerCompressContext::toDebugString() const { return fmt::format( "lz4: {}, lightweight: {}, constant_delta: {}, delta_for: {}, rle: {}, lz4 {} -> {}, lightweight {} -> {}", @@ -187,9 +43,9 @@ String CompressionCodecIntegerLightweight::CompressContext::toDebugString() cons lw_compressed_size); } -void CompressionCodecIntegerLightweight::CompressContext::update(size_t uncompressed_size, size_t compressed_size) +void CompressionCodecLightweight::IntegerCompressContext::update(size_t uncompressed_size, size_t compressed_size) { - if (mode == Mode::LZ4) + if (mode == IntegerMode::LZ4) { lz4_uncompressed_size += uncompressed_size; lz4_compressed_size += compressed_size; @@ -201,15 +57,15 @@ void CompressionCodecIntegerLightweight::CompressContext::update(size_t uncompre lw_compressed_size += compressed_size; ++lw_counter; } - if (mode == Mode::CONSTANT_DELTA) + if (mode == IntegerMode::CONSTANT_DELTA) ++constant_delta_counter; - if (mode == Mode::DELTA_FOR) + if (mode == IntegerMode::DELTA_FOR) ++delta_for_counter; - if (mode == Mode::RunLength) + if (mode == IntegerMode::RunLength) ++rle_counter; } -bool CompressionCodecIntegerLightweight::CompressContext::needAnalyze() const +bool CompressionCodecLightweight::IntegerCompressContext::needAnalyze() const { // lightweight codec is never used, do not analyze anymore if (lz4_counter > 5 && lw_counter == 0) @@ -220,28 +76,28 @@ bool CompressionCodecIntegerLightweight::CompressContext::needAnalyze() const return true; } -bool CompressionCodecIntegerLightweight::CompressContext::needAnalyzeDelta() const +bool CompressionCodecLightweight::IntegerCompressContext::needAnalyzeDelta() const { return lw_counter <= 5 || constant_delta_counter != 0 || delta_for_counter != 0; } -bool CompressionCodecIntegerLightweight::CompressContext::needAnalyzeRunLength() const +bool CompressionCodecLightweight::IntegerCompressContext::needAnalyzeRunLength() const { return lw_counter <= 5 || rle_counter != 0; } template -void CompressionCodecIntegerLightweight::CompressContext::analyze(std::span & values, State & state) +void CompressionCodecLightweight::IntegerCompressContext::analyze(std::span & values, IntegerState & state) { if (values.empty()) { - mode = Mode::Invalid; + mode = IntegerMode::Invalid; return; } if (!needAnalyze()) { - RUNTIME_CHECK(mode == Mode::LZ4); + RUNTIME_CHECK(mode == IntegerMode::LZ4); return; } @@ -251,7 +107,7 @@ void CompressionCodecIntegerLightweight::CompressContext::analyze(std::span(min_delta); - mode = Mode::CONSTANT_DELTA; + mode = IntegerMode::CONSTANT_DELTA; return; } @@ -309,103 +165,189 @@ void CompressionCodecIntegerLightweight::CompressContext::analyze(std::span values_copy(values.begin(), values.end()); state = FORState{std::move(values_copy), min_value, for_width}; - mode = Mode::FOR; + mode = IntegerMode::FOR; } else if (needAnalyzeDelta() && delta_for_size < estimate_lz_size) { state = DeltaFORState{std::move(deltas), min_delta, delta_for_width}; - mode = Mode::DELTA_FOR; + mode = IntegerMode::DELTA_FOR; } else { - mode = Mode::LZ4; + mode = IntegerMode::LZ4; } } -UInt32 CompressionCodecIntegerLightweight::doCompressData(const char * source, UInt32 source_size, char * dest) const +template +size_t CompressionCodecLightweight::compressDataForInteger(const char * source, UInt32 source_size, char * dest) const { + const auto bytes_size = static_cast(data_type); + assert(bytes_size == sizeof(T)); if unlikely (source_size % bytes_size != 0) throw Exception( ErrorCodes::CANNOT_COMPRESS, - "Cannot compress with lightweight codec, data size {} is not aligned to {}", + "Cannot compress with lightweight-integer codec, data size {} is not aligned to {}", source_size, bytes_size); - dest[0] = bytes_size; - dest += 1; - switch (bytes_size) + // Load values + const size_t count = source_size / bytes_size; + std::span values(reinterpret_cast(source), count); + + // Analyze + IntegerState state; + ctx.analyze(values, state); + + // Compress + unalignedStore(dest, static_cast(ctx.mode)); + dest += sizeof(UInt8); + size_t compressed_size = 1; + switch (ctx.mode) + { + case IntegerMode::CONSTANT: { - case 1: - return 1 + compressDataForType(source, source_size, dest); - case 2: - return 1 + compressDataForType(source, source_size, dest); - case 4: - return 1 + compressDataForType(source, source_size, dest); - case 8: - return 1 + compressDataForType(source, source_size, dest); + compressed_size += Compression::constantEncoding(std::get<0>(state), dest); + break; + } + case IntegerMode::CONSTANT_DELTA: + { + compressed_size += Compression::constantDeltaEncoding(values[0], std::get<0>(state), dest); + break; + } + case IntegerMode::RunLength: + { + compressed_size += Compression::runLengthEncoding(std::get<1>(state), dest); + break; + } + case IntegerMode::FOR: + { + FORState for_state = std::get<2>(state); + compressed_size += Compression::FOREncoding(for_state.values, for_state.min_value, for_state.bit_width, dest); + break; + } + case IntegerMode::DELTA_FOR: + { + DeltaFORState delta_for_state = std::get<3>(state); + compressed_size += Compression::FOREncoding, true>( + delta_for_state.deltas, + delta_for_state.min_delta_value, + delta_for_state.bit_width, + dest); + break; + } + case IntegerMode::LZ4: + { + auto success = LZ4_compress_fast( + source, + dest, + source_size, + LZ4_COMPRESSBOUND(source_size), + CompressionSetting::getDefaultLevel(CompressionMethod::LZ4)); + if (!success) + throw Exception("Cannot LZ4_compress_fast", ErrorCodes::CANNOT_COMPRESS); + compressed_size += success; + break; + } default: throw Exception( ErrorCodes::CANNOT_COMPRESS, - "Cannot compress with lightweight codec, unknown bytes size {}", - bytes_size); + "Cannot compress with lightweight-integer codec, unknown mode {}", + static_cast(ctx.mode)); } + + // Update statistics + ctx.update(source_size, compressed_size); + + return compressed_size; } -void CompressionCodecIntegerLightweight::doDecompressData( +template +void CompressionCodecLightweight::decompressDataForInteger( const char * source, UInt32 source_size, char * dest, - UInt32 uncompressed_size) const + UInt32 output_size) const { - if unlikely (source_size < 2) - throw Exception( - ErrorCodes::CANNOT_DECOMPRESS, - "Cannot decompress lightweight-encoded data. File has wrong header"); - - if (uncompressed_size == 0) - return; - - UInt8 bytes_size = source[0]; - - if unlikely (bytes_size != 1 && bytes_size != 2 && bytes_size != 4 && bytes_size != 8) + if unlikely (output_size % sizeof(T) != 0) throw Exception( ErrorCodes::CANNOT_DECOMPRESS, - "Cannot decompress lightweight-encoded data. File has wrong header"); + "Cannot decompress lightweight-integer codec data. Uncompressed size {} is not aligned to {}", + output_size, + sizeof(T)); - if unlikely (uncompressed_size % bytes_size != 0) - throw Exception( - ErrorCodes::CANNOT_DECOMPRESS, - "Cannot decompress lightweight-encoded data. Uncompressed size {} is not aligned to {}", - uncompressed_size, - bytes_size); - - UInt32 source_size_no_header = source_size - 1; - switch (bytes_size) + auto mode = static_cast(unalignedLoad(source)); + source += sizeof(UInt8); + source_size -= sizeof(UInt8); + switch (mode) { - case 1: - decompressDataForType(&source[1], source_size_no_header, dest, uncompressed_size); + case IntegerMode::CONSTANT: + Compression::constantDecoding(source, source_size, dest, output_size); + break; + case IntegerMode::CONSTANT_DELTA: + Compression::constantDeltaDecoding(source, source_size, dest, output_size); break; - case 2: - decompressDataForType(&source[1], source_size_no_header, dest, uncompressed_size); + case IntegerMode::RunLength: + Compression::runLengthDecoding(source, source_size, dest, output_size); + break; + case IntegerMode::FOR: + Compression::FORDecoding(source, source_size, dest, output_size); break; - case 4: - decompressDataForType(&source[1], source_size_no_header, dest, uncompressed_size); + case IntegerMode::DELTA_FOR: + Compression::deltaFORDecoding(source, source_size, dest, output_size); break; - case 8: - decompressDataForType(&source[1], source_size_no_header, dest, uncompressed_size); + case IntegerMode::LZ4: + if (unlikely(LZ4_decompress_safe(source, dest, source_size, output_size) < 0)) + throw Exception("Cannot LZ4_decompress_safe", ErrorCodes::CANNOT_DECOMPRESS); break; default: throw Exception( ErrorCodes::CANNOT_DECOMPRESS, - "Cannot compress with lightweight codec, unknown bytes size {}", - bytes_size); + "Cannot decompress with lightweight-integer codec, unknown mode {}", + static_cast(mode)); } } +template size_t CompressionCodecLightweight::compressDataForInteger( + const char * source, + UInt32 source_size, + char * dest) const; +template size_t CompressionCodecLightweight::compressDataForInteger( + const char * source, + UInt32 source_size, + char * dest) const; +template size_t CompressionCodecLightweight::compressDataForInteger( + const char * source, + UInt32 source_size, + char * dest) const; +template size_t CompressionCodecLightweight::compressDataForInteger( + const char * source, + UInt32 source_size, + char * dest) const; +template void CompressionCodecLightweight::decompressDataForInteger( + const char * source, + UInt32 source_size, + char * dest, + UInt32 output_size) const; +template void CompressionCodecLightweight::decompressDataForInteger( + const char * source, + UInt32 source_size, + char * dest, + UInt32 output_size) const; +template void CompressionCodecLightweight::decompressDataForInteger( + const char * source, + UInt32 source_size, + char * dest, + UInt32 output_size) const; +template void CompressionCodecLightweight::decompressDataForInteger( + const char * source, + UInt32 source_size, + char * dest, + UInt32 output_size) const; + } // namespace DB diff --git a/dbms/src/IO/Compression/CompressionCodecLightweight_NonInteger.cpp b/dbms/src/IO/Compression/CompressionCodecLightweight_NonInteger.cpp new file mode 100644 index 00000000000..816d8a00b7b --- /dev/null +++ b/dbms/src/IO/Compression/CompressionCodecLightweight_NonInteger.cpp @@ -0,0 +1,53 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int CANNOT_COMPRESS; +extern const int CANNOT_DECOMPRESS; +} // namespace ErrorCodes + +size_t CompressionCodecLightweight::compressDataForNonInteger(const char * source, UInt32 source_size, char * dest) +{ + auto success = LZ4_compress_fast( + source, + dest, + source_size, + LZ4_COMPRESSBOUND(source_size), + CompressionSetting::getDefaultLevel(CompressionMethod::LZ4)); + if (!success) + throw Exception("Cannot LZ4_compress_fast", ErrorCodes::CANNOT_COMPRESS); + return success; +} + + +void CompressionCodecLightweight::decompressDataForNonInteger( + const char * source, + UInt32 source_size, + char * dest, + UInt32 output_size) +{ + if (unlikely(LZ4_decompress_safe(source, dest, source_size, output_size) < 0)) + throw Exception("Cannot LZ4_decompress_safe", ErrorCodes::CANNOT_DECOMPRESS); +} + +} // namespace DB diff --git a/dbms/src/IO/Compression/CompressionCodecRunLength.cpp b/dbms/src/IO/Compression/CompressionCodecRunLength.cpp index ed438ea7ce7..e8782c176c2 100644 --- a/dbms/src/IO/Compression/CompressionCodecRunLength.cpp +++ b/dbms/src/IO/Compression/CompressionCodecRunLength.cpp @@ -15,9 +15,13 @@ #include #include #include +#include #include #include #include +#include + +#include namespace DB @@ -29,8 +33,8 @@ extern const int CANNOT_COMPRESS; extern const int CANNOT_DECOMPRESS; } // namespace ErrorCodes -CompressionCodecRunLength::CompressionCodecRunLength(UInt8 bytes_size_) - : bytes_size(bytes_size_) +CompressionCodecRunLength::CompressionCodecRunLength(CompressionDataType data_type_) + : data_type(data_type_) {} UInt8 CompressionCodecRunLength::getMethodByte() const @@ -40,18 +44,15 @@ UInt8 CompressionCodecRunLength::getMethodByte() const UInt32 CompressionCodecRunLength::getMaxCompressedDataSize(UInt32 uncompressed_size) const { - // If the encoded data is larger than the original data, we will store the original data - // Additional byte is used to store the size of the data type - return 1 + uncompressed_size; + return 1 + LZ4_COMPRESSBOUND(uncompressed_size); } -namespace -{ -constexpr UInt8 JUST_COPY_CODE = 0xFF; - template -UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) +UInt32 CompressionCodecRunLength::compressDataForInteger(const char * source, UInt32 source_size, char * dest) const { + constexpr auto bytes_size = sizeof(T); + if unlikely (source_size % bytes_size != 0) + throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "source size {} is not aligned to {}", source_size, bytes_size); const char * source_end = source + source_size; DB::Compression::RunLengthPairs rle_vec; rle_vec.reserve(source_size / sizeof(T)); @@ -67,34 +68,47 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest) if (DB::Compression::runLengthPairsSize(rle_vec) >= source_size) { - dest[0] = JUST_COPY_CODE; - memcpy(&dest[1], source, source_size); - return 1 + source_size; + // treat as string + dest[0] = magic_enum::enum_integer(CompressionDataType::String); + dest += 1; + auto success = LZ4_compress_fast( + source, + dest, + source_size, + LZ4_COMPRESSBOUND(source_size), + CompressionSetting::getDefaultLevel(CompressionMethod::LZ4)); + if (!success) + throw Exception("Cannot LZ4_compress_fast", ErrorCodes::CANNOT_COMPRESS); + return 1 + success; } - dest[0] = sizeof(T); + dest[0] = magic_enum::enum_integer(data_type); dest += 1; return 1 + DB::Compression::runLengthEncoding(rle_vec, dest); } -} // namespace - UInt32 CompressionCodecRunLength::doCompressData(const char * source, UInt32 source_size, char * dest) const { - if unlikely (source_size % bytes_size != 0) - throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "source size {} is not aligned to {}", source_size, bytes_size); - switch (bytes_size) + switch (data_type) { - case 1: - return compressDataForType(source, source_size, dest); - case 2: - return compressDataForType(source, source_size, dest); - case 4: - return compressDataForType(source, source_size, dest); - case 8: - return compressDataForType(source, source_size, dest); + case CompressionDataType::Int8: + return compressDataForInteger(source, source_size, dest); + case CompressionDataType::Int16: + return compressDataForInteger(source, source_size, dest); + case CompressionDataType::Int32: + return compressDataForInteger(source, source_size, dest); + case CompressionDataType::Int64: + return compressDataForInteger(source, source_size, dest); default: - throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress RunLength-encoded data. Unsupported bytes size"); + auto success = LZ4_compress_fast( + source, + dest, + source_size, + LZ4_COMPRESSBOUND(source_size), + CompressionSetting::getDefaultLevel(CompressionMethod::LZ4)); + if (!success) + throw Exception("Cannot LZ4_compress_fast", ErrorCodes::CANNOT_COMPRESS); + return 1 + success; } } @@ -113,42 +127,27 @@ void CompressionCodecRunLength::doDecompressData( return; UInt8 bytes_size = source[0]; - if (bytes_size == JUST_COPY_CODE) - { - if (source_size - 1 < uncompressed_size) - throw Exception( - ErrorCodes::CANNOT_DECOMPRESS, - "Cannot decompress RunLength-encoded data. File has wrong header"); + auto data_type = magic_enum::enum_cast(bytes_size); + RUNTIME_CHECK(data_type.has_value()); - memcpy(dest, &source[1], uncompressed_size); - return; - } - - if unlikely (uncompressed_size % bytes_size != 0) - throw Exception( - ErrorCodes::CANNOT_DECOMPRESS, - "uncompressed size {} is not aligned to {}", - uncompressed_size, - bytes_size); - - switch (bytes_size) + switch (data_type.value()) { - case 1: + case CompressionDataType::Int8: DB::Compression::runLengthDecoding(&source[1], source_size - 1, dest, uncompressed_size); break; - case 2: + case CompressionDataType::Int16: DB::Compression::runLengthDecoding(&source[1], source_size - 1, dest, uncompressed_size); break; - case 4: + case CompressionDataType::Int32: DB::Compression::runLengthDecoding(&source[1], source_size - 1, dest, uncompressed_size); break; - case 8: + case CompressionDataType::Int64: DB::Compression::runLengthDecoding(&source[1], source_size - 1, dest, uncompressed_size); break; default: - throw Exception( - ErrorCodes::CANNOT_DECOMPRESS, - "Cannot decompress RunLength-encoded data. Unsupported bytes size"); + if (unlikely(LZ4_decompress_safe(&source[1], dest, source_size - 1, uncompressed_size) < 0)) + throw Exception("Cannot LZ4_decompress_safe", ErrorCodes::CANNOT_DECOMPRESS); + break; } } diff --git a/dbms/src/IO/Compression/CompressionCodecRunLength.h b/dbms/src/IO/Compression/CompressionCodecRunLength.h index c3d38090346..86a401765a0 100644 --- a/dbms/src/IO/Compression/CompressionCodecRunLength.h +++ b/dbms/src/IO/Compression/CompressionCodecRunLength.h @@ -22,7 +22,7 @@ namespace DB class CompressionCodecRunLength : public ICompressionCodec { public: - explicit CompressionCodecRunLength(UInt8 bytes_size_); + explicit CompressionCodecRunLength(CompressionDataType data_type_); UInt8 getMethodByte() const override; @@ -37,7 +37,11 @@ class CompressionCodecRunLength : public ICompressionCodec bool isGenericCompression() const override { return false; } private: - const UInt8 bytes_size; + template + UInt32 compressDataForInteger(const char * source, UInt32 source_size, char * dest) const; + +private: + const CompressionDataType data_type; }; } // namespace DB diff --git a/dbms/src/IO/Compression/CompressionFactory.h b/dbms/src/IO/Compression/CompressionFactory.h index 06a458a5144..5798ac72f5e 100644 --- a/dbms/src/IO/Compression/CompressionFactory.h +++ b/dbms/src/IO/Compression/CompressionFactory.h @@ -17,8 +17,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -51,7 +51,7 @@ class CompressionFactory case CompressionMethod::ZSTD: return std::make_unique(setting.level); case CompressionMethod::Lightweight: - return std::make_unique(setting.type_bytes_size); + return std::make_unique(setting.type_bytes_size); #if USE_QPL case CompressionMethod::QPL: return std::make_unique(); diff --git a/dbms/src/IO/Compression/CompressionInfo.h b/dbms/src/IO/Compression/CompressionInfo.h index c8b59c974b7..31c631c3291 100644 --- a/dbms/src/IO/Compression/CompressionInfo.h +++ b/dbms/src/IO/Compression/CompressionInfo.h @@ -66,4 +66,15 @@ enum class CompressionMethodByte : UInt8 }; // clang-format on +enum class CompressionDataType : UInt8 +{ + Int8 = 1, // Int8/UInt8 + Int16 = 2, // Int16/UInt16 + Int32 = 4, // Int32/UInt32 + Int64 = 8, // Int64/UInt64 + Float32 = 9, + Float64 = 10, + String = 11, +}; + } // namespace DB \ No newline at end of file diff --git a/dbms/src/IO/Compression/CompressionSettings.h b/dbms/src/IO/Compression/CompressionSettings.h index 54bf73714da..827f66423a6 100644 --- a/dbms/src/IO/Compression/CompressionSettings.h +++ b/dbms/src/IO/Compression/CompressionSettings.h @@ -50,7 +50,7 @@ struct CompressionSetting CompressionMethod method; CompressionMethodByte method_byte; int level; - UInt8 type_bytes_size = 1; + CompressionDataType type_bytes_size = CompressionDataType::String; CompressionSetting() : CompressionSetting(CompressionMethod::LZ4) diff --git a/dbms/src/IO/Compression/EncodingUtil.h b/dbms/src/IO/Compression/EncodingUtil.h index 2632e80b65a..0d91ec188e7 100644 --- a/dbms/src/IO/Compression/EncodingUtil.h +++ b/dbms/src/IO/Compression/EncodingUtil.h @@ -186,6 +186,13 @@ void applyFrameOfReference(T * dst, T frame_of_reference, UInt32 count); template void FORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { + UInt8 bytes_size = sizeof(T); + if unlikely (dest_size % bytes_size != 0) + throw Exception( + ErrorCodes::CANNOT_DECOMPRESS, + "uncompressed size {} is not aligned to {}", + dest_size, + bytes_size); const auto count = dest_size / sizeof(T); T frame_of_reference = unalignedLoad(src); src += sizeof(T); diff --git a/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp b/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp index 0c458dfdaeb..cf09fbcd9de 100644 --- a/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp +++ b/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp @@ -351,7 +351,7 @@ CodecTestSequence generateSeq(Generator gen, const char * gen_name, B Begin = 0, CompressionCodecPtr makeCodec(const CompressionMethodByte method_byte, UInt8 type_byte) { CompressionSetting setting(method_byte); - setting.type_bytes_size = type_byte; + setting.type_bytes_size = magic_enum::enum_cast(type_byte).value(); return CompressionFactory::create(setting); } @@ -534,7 +534,7 @@ std::vector generatePyramidOfSequences( const auto IntegerCodecsToTest = ::testing::Values( CompressionMethodByte::Lightweight, CompressionMethodByte::DeltaFOR, - // CompressionMethodByte::FOR, // disable FOR codec for now, since there are too many unit tests. + CompressionMethodByte::FOR, CompressionMethodByte::RunLength #if USE_QPL , diff --git a/dbms/src/Storages/DeltaMerge/File/DMFileWriter.h b/dbms/src/Storages/DeltaMerge/File/DMFileWriter.h index f0ee9bf8606..d85185bc729 100644 --- a/dbms/src/Storages/DeltaMerge/File/DMFileWriter.h +++ b/dbms/src/Storages/DeltaMerge/File/DMFileWriter.h @@ -65,25 +65,12 @@ class DMFileWriter /*flags*/ -1, /*mode*/ 0666, max_compress_block_size)) + , compressed_buf(CompressedWriteBuffer<>::build( + *plain_file, + compression_settings, + !dmfile->getConfiguration().has_value())) , minmaxes(do_index ? std::make_shared(*type) : nullptr) { - // TODO: better, now only for test - if (type->isInteger()) - { - assert(compression_settings.settings.size() == 1); - CompressionSettings settings(CompressionMethod::Lightweight); - auto & setting = settings.settings[0]; - setting.type_bytes_size = type->getSizeOfValueInMemory(); - compressed_buf = CompressedWriteBuffer<>::build(*plain_file, settings, !dmfile->getConfiguration()); - } - else - { - compressed_buf = CompressedWriteBuffer<>::build( // - *plain_file, - compression_settings, - !dmfile->getConfiguration()); - } - if (!dmfile->useMetaV2()) { // will not used in DMFileFormat::V3, could be removed when v3 is default From e993bb0f7b934067b671896c8b57c4ca93c0548a Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Fri, 21 Jun 2024 16:22:38 +0800 Subject: [PATCH 11/27] rename Signed-off-by: Lloyd-Pottiger --- ...eight_Interger.cpp => CompressionCodecLightweight_Integer.cpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename dbms/src/IO/Compression/{CompressionCodecLightweight_Interger.cpp => CompressionCodecLightweight_Integer.cpp} (100%) diff --git a/dbms/src/IO/Compression/CompressionCodecLightweight_Interger.cpp b/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp similarity index 100% rename from dbms/src/IO/Compression/CompressionCodecLightweight_Interger.cpp rename to dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp From 00da398b3f4aee99f93a823f2affc04d05c1bfcc Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Mon, 24 Jun 2024 10:48:01 +0800 Subject: [PATCH 12/27] refine Signed-off-by: Lloyd-Pottiger --- .../Compression/CompressionCodecLightweight.h | 8 ++++++ .../CompressionCodecLightweight_Integer.cpp | 26 +++++++++---------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/dbms/src/IO/Compression/CompressionCodecLightweight.h b/dbms/src/IO/Compression/CompressionCodecLightweight.h index 927ab466bce..3fa893d0ee8 100644 --- a/dbms/src/IO/Compression/CompressionCodecLightweight.h +++ b/dbms/src/IO/Compression/CompressionCodecLightweight.h @@ -109,6 +109,14 @@ class CompressionCodecLightweight : public ICompressionCodec IntegerMode mode = IntegerMode::LZ4; private: + // The threshold for the number of blocks to decide whether need to analyze. + // For example: + // If lz4 is used more than COUNT_THRESHOLD times and the compression ratio is better than lightweight codec, do not analyze anymore. + static constexpr size_t COUNT_THRESHOLD = 5; + // Assume that the compression ratio of LZ4 is 3.0 + // The official document says that the compression ratio of LZ4 is 2.1, https://github.com/lz4/lz4 + static constexpr size_t ESRTIMATE_LZ4_COMPRESSION_RATIO = 3; + size_t lw_uncompressed_size = 0; size_t lw_compressed_size = 0; size_t lw_counter = 0; diff --git a/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp b/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp index e57050f9bcb..45fc51aecd7 100644 --- a/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp +++ b/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include namespace DB @@ -68,22 +67,23 @@ void CompressionCodecLightweight::IntegerCompressContext::update(size_t uncompre bool CompressionCodecLightweight::IntegerCompressContext::needAnalyze() const { // lightweight codec is never used, do not analyze anymore - if (lz4_counter > 5 && lw_counter == 0) + if (lz4_counter > COUNT_THRESHOLD && lw_counter == 0) return false; - // if lz4 is used more than 5 times and the compression ratio is better than lightweight codec, do not analyze anymore - if (lz4_counter > 5 && lz4_uncompressed_size / lz4_compressed_size > lw_compressed_size / lw_uncompressed_size) + // if lz4 is used more than COUNT_THRESHOLD times and the compression ratio is better than lightweight codec, do not analyze anymore + if (lz4_counter > COUNT_THRESHOLD + && lz4_uncompressed_size / lz4_compressed_size > lw_compressed_size / lw_uncompressed_size) return false; return true; } bool CompressionCodecLightweight::IntegerCompressContext::needAnalyzeDelta() const { - return lw_counter <= 5 || constant_delta_counter != 0 || delta_for_counter != 0; + return lw_counter <= COUNT_THRESHOLD || constant_delta_counter != 0 || delta_for_counter != 0; } bool CompressionCodecLightweight::IntegerCompressContext::needAnalyzeRunLength() const { - return lw_counter <= 5 || rle_counter != 0; + return lw_counter <= COUNT_THRESHOLD || rle_counter != 0; } template @@ -101,6 +101,9 @@ void CompressionCodecLightweight::IntegerCompressContext::analyze(std::span(max_value - min_value); - // additional T bytes for min_value, and 1 byte for width - size_t for_size = BitpackingPrimitives::getRequiredSize(values.size(), for_width) + sizeof(T) + sizeof(UInt8); - // Assume that the compression ratio of LZ4 is 3.0 - // The official document says that the compression ratio of LZ4 is 2.1, https://github.com/lz4/lz4 - size_t estimate_lz_size = values.size() * sizeof(T) / 3; + size_t for_size = BitpackingPrimitives::getRequiredSize(values.size(), for_width) + ADDTIONAL_BYTES; + size_t estimate_lz_size = values.size() * sizeof(T) / ESRTIMATE_LZ4_COMPRESSION_RATIO; size_t rle_size = rle.empty() ? std::numeric_limits::max() : Compression::runLengthPairsSize(rle); if (needAnalyzeRunLength() && rle_size < delta_for_size && rle_size < for_size && rle_size < estimate_lz_size) { From 3cf3f1ed3a98a666505e47e147084857d3342fd5 Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Mon, 24 Jun 2024 15:09:40 +0800 Subject: [PATCH 13/27] refine Signed-off-by: Lloyd-Pottiger --- dbms/src/IO/Compression/CompressionFactory.h | 31 +++++------ dbms/src/IO/Compression/CompressionSettings.h | 2 +- .../tests/gtest_codec_compression.cpp | 53 +++++-------------- 3 files changed, 27 insertions(+), 59 deletions(-) diff --git a/dbms/src/IO/Compression/CompressionFactory.h b/dbms/src/IO/Compression/CompressionFactory.h index 5798ac72f5e..a12b8896929 100644 --- a/dbms/src/IO/Compression/CompressionFactory.h +++ b/dbms/src/IO/Compression/CompressionFactory.h @@ -42,31 +42,29 @@ class CompressionFactory public: static CompressionCodecPtr create(const CompressionSetting & setting) { - switch (setting.method) + // LZ4 and LZ4HC have the same format, the difference is only in compression. + // So they have the same method byte. + if (setting.method == CompressionMethod::LZ4HC) + return std::make_unique(setting.level); + + switch (setting.method_byte) { - case CompressionMethod::LZ4: + case CompressionMethodByte::LZ4: return std::make_unique(setting.level); - case CompressionMethod::LZ4HC: - return std::make_unique(setting.level); - case CompressionMethod::ZSTD: + case CompressionMethodByte::ZSTD: return std::make_unique(setting.level); - case CompressionMethod::Lightweight: - return std::make_unique(setting.type_bytes_size); #if USE_QPL - case CompressionMethod::QPL: + case CompressionMethodByte::QPL: return std::make_unique(); #endif - default: - break; - } - switch (setting.method_byte) - { + case CompressionMethodByte::Lightweight: + return std::make_unique(setting.data_type); case CompressionMethodByte::DeltaFOR: - return std::make_unique(setting.type_bytes_size); + return std::make_unique(setting.data_type); case CompressionMethodByte::RunLength: - return std::make_unique(setting.type_bytes_size); + return std::make_unique(setting.data_type); case CompressionMethodByte::FOR: - return std::make_unique(setting.type_bytes_size); + return std::make_unique(setting.data_type); case CompressionMethodByte::NONE: return std::make_unique(); default: @@ -96,7 +94,6 @@ class CompressionFactory private: static Codecs createCodecs(const CompressionSettings & settings) { - RUNTIME_CHECK(settings.settings.size() > 1); Codecs codecs; codecs.reserve(settings.settings.size()); for (const auto & setting : settings.settings) diff --git a/dbms/src/IO/Compression/CompressionSettings.h b/dbms/src/IO/Compression/CompressionSettings.h index 827f66423a6..5363b0aca5d 100644 --- a/dbms/src/IO/Compression/CompressionSettings.h +++ b/dbms/src/IO/Compression/CompressionSettings.h @@ -50,7 +50,7 @@ struct CompressionSetting CompressionMethod method; CompressionMethodByte method_byte; int level; - CompressionDataType type_bytes_size = CompressionDataType::String; + CompressionDataType data_type = CompressionDataType::String; CompressionSetting() : CompressionSetting(CompressionMethod::LZ4) diff --git a/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp b/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp index cf09fbcd9de..35f80b2a296 100644 --- a/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp +++ b/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp @@ -351,7 +351,7 @@ CodecTestSequence generateSeq(Generator gen, const char * gen_name, B Begin = 0, CompressionCodecPtr makeCodec(const CompressionMethodByte method_byte, UInt8 type_byte) { CompressionSetting setting(method_byte); - setting.type_bytes_size = magic_enum::enum_cast(type_byte).value(); + setting.data_type = magic_enum::enum_cast(type_byte).value(); return CompressionFactory::create(setting); } @@ -362,18 +362,21 @@ void testTranscoding(ICompressionCodec & codec, const CodecTestSequence & test_s const UInt32 encoded_max_size = codec.getCompressedReserveSize(static_cast(source_data.size())); PODArray encoded(encoded_max_size); - assert(source_data.data() != nullptr); // Codec assumes that source buffer is not null. - const UInt32 encoded_size - = codec.compress(source_data.data(), static_cast(source_data.size()), encoded.data()); - + ASSERT_TRUE(source_data.data() != nullptr); // Codec assumes that source buffer is not null. + const UInt32 encoded_size = codec.compress( // + source_data.data(), + static_cast(source_data.size()), + encoded.data()); encoded.resize(encoded_size); - PODArray decoded(source_data.size()); - - const auto decoded_size = codec.readDecompressedBlockSize(encoded.data()); + auto method_byte = ICompressionCodec::readMethod(encoded.data()); + ASSERT_EQ(method_byte, codec.getMethodByte()); - codec.decompress(encoded.data(), static_cast(encoded.size()), decoded.data(), decoded_size); + PODArray decoded(source_data.size()); + const auto decode_codec = CompressionFactory::createForDecompress(method_byte); + const auto decoded_size = decode_codec->readDecompressedBlockSize(encoded.data()); + decode_codec->decompress(encoded.data(), static_cast(encoded.size()), decoded.data(), decoded_size); decoded.resize(decoded_size); ASSERT_TRUE(EqualByteContainers(test_sequence.data_type->getSizeOfValueInMemory(), source_data, decoded)); @@ -546,38 +549,6 @@ const auto IntegerCodecsToTest = ::testing::Values( // test cases /////////////////////////////////////////////////////////////////////////////////////////////////// -// INSTANTIATE_TEST_CASE_P( -// Simple, -// CodecTest, -// ::testing::Combine( -// IntegerCodecsToTest, -// ::testing::Values(makeSeq( -// 1, -// 2, -// 3, -// 5, -// 7, -// 11, -// 13, -// 17, -// 23, -// 29, -// 31, -// 37, -// 41, -// 43, -// 47, -// 53, -// 59, -// 61, -// 67, -// 71, -// 73, -// 79, -// 83, -// 89, -// 97)))); - INSTANTIATE_TEST_CASE_P( SmallSequences, MultipleSequencesCodecTest, From 4b5afc19976ac53de50abbc50d686f82f216c967 Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Mon, 24 Jun 2024 16:15:46 +0800 Subject: [PATCH 14/27] address comments Signed-off-by: Lloyd-Pottiger --- dbms/src/IO/Compression/CompressionCodecRunLength.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dbms/src/IO/Compression/CompressionCodecRunLength.cpp b/dbms/src/IO/Compression/CompressionCodecRunLength.cpp index e8782c176c2..ff1dd44649e 100644 --- a/dbms/src/IO/Compression/CompressionCodecRunLength.cpp +++ b/dbms/src/IO/Compression/CompressionCodecRunLength.cpp @@ -44,6 +44,8 @@ UInt8 CompressionCodecRunLength::getMethodByte() const UInt32 CompressionCodecRunLength::getMaxCompressedDataSize(UInt32 uncompressed_size) const { + // If the data is not compressible as run-length encoding, we will compress it as LZ4. + // 1 byte for data type, and the rest for LZ4 compressed data. return 1 + LZ4_COMPRESSBOUND(uncompressed_size); } @@ -55,8 +57,8 @@ UInt32 CompressionCodecRunLength::compressDataForInteger(const char * source, UI throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "source size {} is not aligned to {}", source_size, bytes_size); const char * source_end = source + source_size; DB::Compression::RunLengthPairs rle_vec; - rle_vec.reserve(source_size / sizeof(T)); - for (const auto * src = source; src < source_end; src += sizeof(T)) + rle_vec.reserve(source_size / bytes_size); + for (const auto * src = source; src < source_end; src += bytes_size) { T value = unalignedLoad(src); if (rle_vec.empty() || rle_vec.back().first != value From d029c532045724f722e9b66d8855407581c0d6d5 Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Mon, 24 Jun 2024 17:52:55 +0800 Subject: [PATCH 15/27] address comments Signed-off-by: Lloyd-Pottiger --- dbms/src/IO/Compression/CompressionCodecRunLength.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dbms/src/IO/Compression/CompressionCodecRunLength.cpp b/dbms/src/IO/Compression/CompressionCodecRunLength.cpp index ff1dd44649e..c6aa45065fb 100644 --- a/dbms/src/IO/Compression/CompressionCodecRunLength.cpp +++ b/dbms/src/IO/Compression/CompressionCodecRunLength.cpp @@ -61,6 +61,9 @@ UInt32 CompressionCodecRunLength::compressDataForInteger(const char * source, UI for (const auto * src = source; src < source_end; src += bytes_size) { T value = unalignedLoad(src); + // If the value is different from the previous one or the counter is at the maximum value (255 + 1 = 0), + // we need to start a new run. + // Otherwise, we can just increment the counter. if (rle_vec.empty() || rle_vec.back().first != value || rle_vec.back().second == std::numeric_limits::max()) rle_vec.emplace_back(value, 1); From 86d6c2fd9643c2a39cd4251379fe640d01c7d4f6 Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger <60744015+Lloyd-Pottiger@users.noreply.github.com> Date: Fri, 28 Jun 2024 11:57:26 +0800 Subject: [PATCH 16/27] Apply suggestions from code review Co-authored-by: jinhelin --- dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp | 4 ++-- dbms/src/IO/Compression/CompressionCodecDeltaFOR.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp index 099dfd900ba..0305b855e39 100644 --- a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp +++ b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp @@ -104,7 +104,7 @@ UInt32 CompressionCodecDeltaFOR::doCompressData(const char * source, UInt32 sour source_size, LZ4_COMPRESSBOUND(source_size), CompressionSetting::getDefaultLevel(CompressionMethod::LZ4)); - if (!success) + if (unlikely(!success)) throw Exception("Cannot LZ4_compress_fast", ErrorCodes::CANNOT_COMPRESS); return 1 + success; } @@ -161,7 +161,7 @@ void CompressionCodecDeltaFOR::ordinaryDecompress( ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress DeltaFor-encoded data. File has wrong header"); - if (uncompressed_size == 0) + if (unlikely(uncompressed_size == 0)) return; UInt8 bytes_size = source[0]; diff --git a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.h b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.h index 5faf713e864..d4fc5f62b6d 100644 --- a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.h +++ b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.h @@ -26,6 +26,7 @@ class CompressionCodecDeltaFOR : public ICompressionCodec UInt8 getMethodByte() const override; + // ordinaryDecompress is only used for benchmark comparison. static void ordinaryDecompress(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size); #ifndef DBMS_PUBLIC_GTEST From 9cf8b83816655c8be7664a712b0ee3f3149cc51f Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Fri, 28 Jun 2024 13:28:59 +0800 Subject: [PATCH 17/27] format Signed-off-by: Lloyd-Pottiger --- dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp | 2 +- dbms/src/IO/Compression/CompressionCodecFOR.cpp | 4 ++-- dbms/src/IO/Compression/CompressionCodecLZ4.cpp | 2 +- dbms/src/IO/Compression/CompressionCodecLightweight.cpp | 2 +- .../IO/Compression/CompressionCodecLightweight_Integer.cpp | 2 +- .../Compression/CompressionCodecLightweight_NonInteger.cpp | 2 +- dbms/src/IO/Compression/CompressionCodecRunLength.cpp | 6 +++--- dbms/src/IO/Compression/CompressionInfo.h | 2 +- dbms/src/Storages/KVStore/FFI/SSTReader.h | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp index 0305b855e39..388d463c82a 100644 --- a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp +++ b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp @@ -121,7 +121,7 @@ void CompressionCodecDeltaFOR::doDecompressData( ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress DeltaFor-encoded data. File has wrong header"); - if (uncompressed_size == 0) + if (unlikely(uncompressed_size == 0)) return; UInt8 bytes_size = source[0]; diff --git a/dbms/src/IO/Compression/CompressionCodecFOR.cpp b/dbms/src/IO/Compression/CompressionCodecFOR.cpp index 881232b8155..3c3b380946c 100644 --- a/dbms/src/IO/Compression/CompressionCodecFOR.cpp +++ b/dbms/src/IO/Compression/CompressionCodecFOR.cpp @@ -99,7 +99,7 @@ UInt32 CompressionCodecFOR::doCompressData(const char * source, UInt32 source_si source_size, LZ4_COMPRESSBOUND(source_size), CompressionSetting::getDefaultLevel(CompressionMethod::LZ4)); - if (!success) + if (unlikely(!success)) throw Exception("Cannot LZ4_compress_fast", ErrorCodes::CANNOT_COMPRESS); return 1 + success; } @@ -114,7 +114,7 @@ void CompressionCodecFOR::doDecompressData( if unlikely (source_size < 2) throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress For-encoded data. File has wrong header"); - if (uncompressed_size == 0) + if (unlikely(uncompressed_size == 0)) return; UInt8 bytes_size = source[0]; diff --git a/dbms/src/IO/Compression/CompressionCodecLZ4.cpp b/dbms/src/IO/Compression/CompressionCodecLZ4.cpp index 7a7e91c6c97..5f0aa8719ae 100644 --- a/dbms/src/IO/Compression/CompressionCodecLZ4.cpp +++ b/dbms/src/IO/Compression/CompressionCodecLZ4.cpp @@ -62,7 +62,7 @@ UInt32 CompressionCodecLZ4HC::doCompressData(const char * source, UInt32 source_ { auto success = LZ4_compress_HC(source, dest, source_size, LZ4_COMPRESSBOUND(source_size), level); - if (!success) + if (unlikely(!success)) throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress with LZ4 codec"); return success; diff --git a/dbms/src/IO/Compression/CompressionCodecLightweight.cpp b/dbms/src/IO/Compression/CompressionCodecLightweight.cpp index efa0b77309d..df1ea3e0b3f 100644 --- a/dbms/src/IO/Compression/CompressionCodecLightweight.cpp +++ b/dbms/src/IO/Compression/CompressionCodecLightweight.cpp @@ -89,7 +89,7 @@ void CompressionCodecLightweight::doDecompressData( ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress lightweight codec data. File has wrong header"); - if (uncompressed_size == 0) + if (unlikely(uncompressed_size == 0)) return; UInt8 bytes_size = source[0]; diff --git a/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp b/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp index 45fc51aecd7..1de05fbec82 100644 --- a/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp +++ b/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp @@ -247,7 +247,7 @@ size_t CompressionCodecLightweight::compressDataForInteger(const char * source, source_size, LZ4_COMPRESSBOUND(source_size), CompressionSetting::getDefaultLevel(CompressionMethod::LZ4)); - if (!success) + if (unlikely(!success)) throw Exception("Cannot LZ4_compress_fast", ErrorCodes::CANNOT_COMPRESS); compressed_size += success; break; diff --git a/dbms/src/IO/Compression/CompressionCodecLightweight_NonInteger.cpp b/dbms/src/IO/Compression/CompressionCodecLightweight_NonInteger.cpp index 816d8a00b7b..efe669a2825 100644 --- a/dbms/src/IO/Compression/CompressionCodecLightweight_NonInteger.cpp +++ b/dbms/src/IO/Compression/CompressionCodecLightweight_NonInteger.cpp @@ -34,7 +34,7 @@ size_t CompressionCodecLightweight::compressDataForNonInteger(const char * sourc source_size, LZ4_COMPRESSBOUND(source_size), CompressionSetting::getDefaultLevel(CompressionMethod::LZ4)); - if (!success) + if (unlikely(!success)) throw Exception("Cannot LZ4_compress_fast", ErrorCodes::CANNOT_COMPRESS); return success; } diff --git a/dbms/src/IO/Compression/CompressionCodecRunLength.cpp b/dbms/src/IO/Compression/CompressionCodecRunLength.cpp index c6aa45065fb..08dbd908436 100644 --- a/dbms/src/IO/Compression/CompressionCodecRunLength.cpp +++ b/dbms/src/IO/Compression/CompressionCodecRunLength.cpp @@ -82,7 +82,7 @@ UInt32 CompressionCodecRunLength::compressDataForInteger(const char * source, UI source_size, LZ4_COMPRESSBOUND(source_size), CompressionSetting::getDefaultLevel(CompressionMethod::LZ4)); - if (!success) + if (unlikely(!success)) throw Exception("Cannot LZ4_compress_fast", ErrorCodes::CANNOT_COMPRESS); return 1 + success; } @@ -111,7 +111,7 @@ UInt32 CompressionCodecRunLength::doCompressData(const char * source, UInt32 sou source_size, LZ4_COMPRESSBOUND(source_size), CompressionSetting::getDefaultLevel(CompressionMethod::LZ4)); - if (!success) + if (unlikely(!success)) throw Exception("Cannot LZ4_compress_fast", ErrorCodes::CANNOT_COMPRESS); return 1 + success; } @@ -128,7 +128,7 @@ void CompressionCodecRunLength::doDecompressData( ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress RunLength-encoded data. File has wrong header"); - if (uncompressed_size == 0) + if (unlikely(uncompressed_size == 0)) return; UInt8 bytes_size = source[0]; diff --git a/dbms/src/IO/Compression/CompressionInfo.h b/dbms/src/IO/Compression/CompressionInfo.h index 31c631c3291..f24e99741c6 100644 --- a/dbms/src/IO/Compression/CompressionInfo.h +++ b/dbms/src/IO/Compression/CompressionInfo.h @@ -77,4 +77,4 @@ enum class CompressionDataType : UInt8 String = 11, }; -} // namespace DB \ No newline at end of file +} // namespace DB diff --git a/dbms/src/Storages/KVStore/FFI/SSTReader.h b/dbms/src/Storages/KVStore/FFI/SSTReader.h index 24552eabd86..46195e216d1 100644 --- a/dbms/src/Storages/KVStore/FFI/SSTReader.h +++ b/dbms/src/Storages/KVStore/FFI/SSTReader.h @@ -48,7 +48,7 @@ class MonoSSTReader : public SSTReader BaseBuffView keyView() const override; BaseBuffView valueView() const override; void next() override; - SSTFormatKind sstFormatKind() const { return kind; }; + SSTFormatKind sstFormatKind() const { return kind; } size_t approxSize() const override; std::vector findSplitKeys(uint64_t splits_count) const override; void seek(BaseBuffView && view) const override; From c83a8249c418006fc06300b1b0525be13086e571 Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Mon, 1 Jul 2024 16:49:55 +0800 Subject: [PATCH 18/27] address comments Signed-off-by: Lloyd-Pottiger --- dbms/src/IO/Compression/CompressionCodecLightweight.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dbms/src/IO/Compression/CompressionCodecLightweight.h b/dbms/src/IO/Compression/CompressionCodecLightweight.h index 3fa893d0ee8..39a00ef292d 100644 --- a/dbms/src/IO/Compression/CompressionCodecLightweight.h +++ b/dbms/src/IO/Compression/CompressionCodecLightweight.h @@ -94,10 +94,6 @@ class CompressionCodecLightweight : public ICompressionCodec public: IntegerCompressContext() = default; - bool needAnalyze() const; - bool needAnalyzeDelta() const; - bool needAnalyzeRunLength() const; - template void analyze(std::span & values, IntegerState & state); @@ -108,6 +104,11 @@ class CompressionCodecLightweight : public ICompressionCodec IntegerMode mode = IntegerMode::LZ4; + private: + bool needAnalyze() const; + bool needAnalyzeDelta() const; + bool needAnalyzeRunLength() const; + private: // The threshold for the number of blocks to decide whether need to analyze. // For example: From bf8c288b42ddeecb88989ad243a330e65fa5551e Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Wed, 3 Jul 2024 15:05:54 +0800 Subject: [PATCH 19/27] address comments & fix DeltaFor Signed-off-by: Lloyd-Pottiger --- .../Compression/CompressionCodecDeflateQpl.h | 3 -- .../Compression/CompressionCodecDeltaFOR.cpp | 18 ++++++--- .../IO/Compression/CompressionCodecDeltaFOR.h | 3 -- dbms/src/IO/Compression/CompressionCodecFOR.h | 3 -- dbms/src/IO/Compression/CompressionCodecLZ4.h | 3 -- .../CompressionCodecLightweight.cpp | 5 +++ .../Compression/CompressionCodecLightweight.h | 3 -- .../CompressionCodecLightweight_Integer.cpp | 26 ++++++++----- .../Compression/CompressionCodecMultiple.cpp | 8 ---- .../IO/Compression/CompressionCodecMultiple.h | 3 -- .../src/IO/Compression/CompressionCodecNone.h | 3 -- .../Compression/CompressionCodecRunLength.h | 3 -- .../src/IO/Compression/CompressionCodecZSTD.h | 3 -- dbms/src/IO/Compression/EncodingUtil.cpp | 38 +++++++++++++++---- dbms/src/IO/Compression/EncodingUtil.h | 7 +++- dbms/src/IO/Compression/ICompressionCodec.h | 6 --- 16 files changed, 70 insertions(+), 65 deletions(-) diff --git a/dbms/src/IO/Compression/CompressionCodecDeflateQpl.h b/dbms/src/IO/Compression/CompressionCodecDeflateQpl.h index df01503cb14..131b1a21757 100644 --- a/dbms/src/IO/Compression/CompressionCodecDeflateQpl.h +++ b/dbms/src/IO/Compression/CompressionCodecDeflateQpl.h @@ -103,9 +103,6 @@ class CompressionCodecDeflateQpl final : public ICompressionCodec UInt8 getMethodByte() const override; protected: - bool isCompression() const override { return true; } - bool isGenericCompression() const override { return true; } - UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override; void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override; diff --git a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp index 388d463c82a..f3e13490683 100644 --- a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp +++ b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp @@ -52,11 +52,11 @@ UInt32 CompressionCodecDeltaFOR::getMaxCompressedDataSize(UInt32 uncompressed_si case CompressionDataType::Int32: case CompressionDataType::Int64: { - // |bytes_of_original_type|frame_of_reference|width(bits) |bitpacked data| - // |1 bytes |bytes_size |sizeof(UInt8)|required size | + // |bytes_of_original_type|first_value|frame_of_reference|width(bits) |bitpacked data| + // |1 bytes |bytes_size |bytes_size |sizeof(UInt8)|required size | auto bytes_size = magic_enum::enum_integer(data_type); - const size_t count = uncompressed_size / bytes_size; - return 1 + bytes_size + sizeof(UInt8) + BitpackingPrimitives::getRequiredSize(count, bytes_size * 8); + const size_t deltas_count = uncompressed_size / bytes_size - 1; + return 1 + bytes_size * 2 + sizeof(UInt8) + BitpackingPrimitives::getRequiredSize(deltas_count, bytes_size * 8); } default: return 1 + LZ4_COMPRESSBOUND(uncompressed_size); @@ -72,13 +72,19 @@ UInt32 compressData(const char * source, UInt32 source_size, char * dest) constexpr auto bytes_size = sizeof(T); if unlikely (source_size % bytes_size != 0) throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "source size {} is not aligned to {}", source_size, bytes_size); - const auto count = source_size / sizeof(T); + const auto count = source_size / bytes_size; DB::Compression::deltaEncoding(reinterpret_cast(source), count, reinterpret_cast(dest)); + if (unlikely(count == 1)) + return bytes_size; // Cast deltas to signed type to better compress negative values. // For example, if we have a sequence of UInt8 values [3, 2, 1, 0], the deltas will be [3, -1, -1, -1] // If we compress them as UInt8, we will get [3, 255, 255, 255], which is not optimal. using TS = typename std::make_signed::type; - return DB::CompressionCodecFOR::compressData(reinterpret_cast(dest), source_size, dest); + auto for_size = DB::CompressionCodecFOR::compressData( + reinterpret_cast(dest + bytes_size), + source_size - bytes_size, + dest + bytes_size); + return bytes_size + for_size; } } // namespace diff --git a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.h b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.h index d4fc5f62b6d..9dc9687f152 100644 --- a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.h +++ b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.h @@ -39,9 +39,6 @@ class CompressionCodecDeltaFOR : public ICompressionCodec UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override; - bool isCompression() const override { return true; } - bool isGenericCompression() const override { return false; } - private: const CompressionDataType data_type; }; diff --git a/dbms/src/IO/Compression/CompressionCodecFOR.h b/dbms/src/IO/Compression/CompressionCodecFOR.h index 824c36276cf..3112ab65806 100644 --- a/dbms/src/IO/Compression/CompressionCodecFOR.h +++ b/dbms/src/IO/Compression/CompressionCodecFOR.h @@ -47,9 +47,6 @@ class CompressionCodecFOR : public ICompressionCodec UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override; - bool isCompression() const override { return true; } - bool isGenericCompression() const override { return false; } - private: const CompressionDataType data_type; }; diff --git a/dbms/src/IO/Compression/CompressionCodecLZ4.h b/dbms/src/IO/Compression/CompressionCodecLZ4.h index 70ae9048d5a..4eda28ac714 100644 --- a/dbms/src/IO/Compression/CompressionCodecLZ4.h +++ b/dbms/src/IO/Compression/CompressionCodecLZ4.h @@ -29,9 +29,6 @@ class CompressionCodecLZ4 : public ICompressionCodec protected: UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override; - bool isCompression() const override { return true; } - bool isGenericCompression() const override { return true; } - private: void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override; diff --git a/dbms/src/IO/Compression/CompressionCodecLightweight.cpp b/dbms/src/IO/Compression/CompressionCodecLightweight.cpp index df1ea3e0b3f..51a972340f8 100644 --- a/dbms/src/IO/Compression/CompressionCodecLightweight.cpp +++ b/dbms/src/IO/Compression/CompressionCodecLightweight.cpp @@ -120,6 +120,11 @@ void CompressionCodecLightweight::doDecompressData( case CompressionDataType::String: decompressDataForNonInteger(&source[1], source_size_no_header, dest, uncompressed_size); break; + default: + throw Exception( + ErrorCodes::CANNOT_DECOMPRESS, + "Cannot decompress lightweight codec data. Invalid data type {}", + static_cast(data_type.value())); } } diff --git a/dbms/src/IO/Compression/CompressionCodecLightweight.h b/dbms/src/IO/Compression/CompressionCodecLightweight.h index 39a00ef292d..f1f952fe68f 100644 --- a/dbms/src/IO/Compression/CompressionCodecLightweight.h +++ b/dbms/src/IO/Compression/CompressionCodecLightweight.h @@ -44,9 +44,6 @@ class CompressionCodecLightweight : public ICompressionCodec UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override; - bool isCompression() const override { return true; } // light compression - bool isGenericCompression() const override { return false; } - private: /// Integer data diff --git a/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp b/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp index 1de05fbec82..3c9c496f743 100644 --- a/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp +++ b/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp @@ -101,12 +101,10 @@ void CompressionCodecLightweight::IntegerCompressContext::analyze(std::span(min_delta); mode = IntegerMode::CONSTANT_DELTA; @@ -138,6 +139,8 @@ void CompressionCodecLightweight::IntegerCompressContext::analyze(std::span(max_value - min_value); + // additional T bytes for min_delta, and 1 byte for width + static constexpr auto ADDTIONAL_BYTES = sizeof(T) + sizeof(UInt8); size_t for_size = BitpackingPrimitives::getRequiredSize(values.size(), for_width) + ADDTIONAL_BYTES; size_t estimate_lz_size = values.size() * sizeof(T) / ESRTIMATE_LZ4_COMPRESSION_RATIO; size_t rle_size = rle.empty() ? std::numeric_limits::max() : Compression::runLengthPairsSize(rle); @@ -232,6 +237,9 @@ size_t CompressionCodecLightweight::compressDataForInteger(const char * source, case IntegerMode::DELTA_FOR: { DeltaFORState delta_for_state = std::get<3>(state); + unalignedStore(dest, values[0]); + dest += sizeof(T); + compressed_size += sizeof(T); compressed_size += Compression::FOREncoding, true>( delta_for_state.deltas, delta_for_state.min_delta_value, diff --git a/dbms/src/IO/Compression/CompressionCodecMultiple.cpp b/dbms/src/IO/Compression/CompressionCodecMultiple.cpp index e39d175157d..d5577716b38 100644 --- a/dbms/src/IO/Compression/CompressionCodecMultiple.cpp +++ b/dbms/src/IO/Compression/CompressionCodecMultiple.cpp @@ -120,12 +120,4 @@ std::vector CompressionCodecMultiple::getCodecsBytesFromData(const char * return result; } -bool CompressionCodecMultiple::isCompression() const -{ - for (const auto & codec : codecs) - if (codec->isCompression()) - return true; - return false; -} - } // namespace DB diff --git a/dbms/src/IO/Compression/CompressionCodecMultiple.h b/dbms/src/IO/Compression/CompressionCodecMultiple.h index 9d8a0041265..784718567dd 100644 --- a/dbms/src/IO/Compression/CompressionCodecMultiple.h +++ b/dbms/src/IO/Compression/CompressionCodecMultiple.h @@ -44,9 +44,6 @@ class CompressionCodecMultiple final : public ICompressionCodec void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 decompressed_size) const override; - bool isCompression() const override; - bool isGenericCompression() const override { return false; } - private: Codecs codecs; }; diff --git a/dbms/src/IO/Compression/CompressionCodecNone.h b/dbms/src/IO/Compression/CompressionCodecNone.h index b5d9eaf83cc..8716ba00e43 100644 --- a/dbms/src/IO/Compression/CompressionCodecNone.h +++ b/dbms/src/IO/Compression/CompressionCodecNone.h @@ -33,9 +33,6 @@ class CompressionCodecNone final : public ICompressionCodec void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override; - - bool isCompression() const override { return false; } - bool isGenericCompression() const override { return false; } }; } // namespace DB diff --git a/dbms/src/IO/Compression/CompressionCodecRunLength.h b/dbms/src/IO/Compression/CompressionCodecRunLength.h index 86a401765a0..d8237b5079d 100644 --- a/dbms/src/IO/Compression/CompressionCodecRunLength.h +++ b/dbms/src/IO/Compression/CompressionCodecRunLength.h @@ -33,9 +33,6 @@ class CompressionCodecRunLength : public ICompressionCodec UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override; - bool isCompression() const override { return false; } - bool isGenericCompression() const override { return false; } - private: template UInt32 compressDataForInteger(const char * source, UInt32 source_size, char * dest) const; diff --git a/dbms/src/IO/Compression/CompressionCodecZSTD.h b/dbms/src/IO/Compression/CompressionCodecZSTD.h index 5e180ba8847..77c97550022 100644 --- a/dbms/src/IO/Compression/CompressionCodecZSTD.h +++ b/dbms/src/IO/Compression/CompressionCodecZSTD.h @@ -34,9 +34,6 @@ class CompressionCodecZSTD : public ICompressionCodec void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override; - bool isCompression() const override { return true; } - bool isGenericCompression() const override { return true; } - private: const int level; }; diff --git a/dbms/src/IO/Compression/EncodingUtil.cpp b/dbms/src/IO/Compression/EncodingUtil.cpp index aca78ee1784..5ce62863150 100644 --- a/dbms/src/IO/Compression/EncodingUtil.cpp +++ b/dbms/src/IO/Compression/EncodingUtil.cpp @@ -227,26 +227,48 @@ void deltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 template <> void deltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { - const auto count = dest_size / sizeof(UInt32); - auto round_size = BitpackingPrimitives::roundUpToAlgorithmGroupSize(count); + const auto deltas_count = dest_size / sizeof(UInt32) - 1; + if (unlikely(deltas_count == 0)) + { + memcpy(dest, src, sizeof(UInt32)); + return; + } + auto round_size = BitpackingPrimitives::roundUpToAlgorithmGroupSize(deltas_count); // Reserve enough space for the temporary buffer. - const auto required_size = round_size * sizeof(UInt32); + const auto required_size = round_size * sizeof(UInt32) + sizeof(UInt32); char tmp_buffer[required_size]; memset(tmp_buffer, 0, required_size); - FORDecoding(src, source_size, tmp_buffer, required_size); + // copy the first value to the temporary buffer + memcpy(tmp_buffer, src, sizeof(UInt32)); + FORDecoding( + src + sizeof(UInt32), + source_size - sizeof(UInt32), + tmp_buffer + sizeof(UInt32), + required_size - sizeof(UInt32)); deltaDecoding(reinterpret_cast(tmp_buffer), dest_size, dest); } template <> void deltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { - const auto count = dest_size / sizeof(UInt64); - const auto round_size = BitpackingPrimitives::roundUpToAlgorithmGroupSize(count); + const auto deltas_count = dest_size / sizeof(UInt64) - 1; + if (unlikely(deltas_count == 0)) + { + memcpy(dest, src, sizeof(UInt64)); + return; + } + const auto round_size = BitpackingPrimitives::roundUpToAlgorithmGroupSize(deltas_count); // Reserve enough space for the temporary buffer. - const auto required_size = round_size * sizeof(UInt64); + const auto required_size = round_size * sizeof(UInt64) + sizeof(UInt64); char tmp_buffer[required_size]; memset(tmp_buffer, 0, required_size); - FORDecoding(src, source_size, tmp_buffer, required_size); + // copy the first value to the temporary buffer + memcpy(tmp_buffer, src, sizeof(UInt64)); + FORDecoding( + src + sizeof(UInt64), + source_size - sizeof(UInt64), + tmp_buffer + sizeof(UInt64), + required_size - sizeof(UInt64)); deltaDecoding(reinterpret_cast(tmp_buffer), dest_size, dest); } diff --git a/dbms/src/IO/Compression/EncodingUtil.h b/dbms/src/IO/Compression/EncodingUtil.h index 0d91ec188e7..b092a46111c 100644 --- a/dbms/src/IO/Compression/EncodingUtil.h +++ b/dbms/src/IO/Compression/EncodingUtil.h @@ -256,7 +256,12 @@ template void ordinaryDeltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { using TS = typename std::make_signed_t; - FORDecoding(src, source_size, dest, dest_size); + // copy first value to dest + memcpy(dest, src, sizeof(T)); + if (unlikely(source_size <= sizeof(T))) + return; + // decode deltas + FORDecoding(src + sizeof(T), source_size - sizeof(T), dest + sizeof(T), dest_size - sizeof(T)); ordinaryDeltaDecoding(dest, dest_size, dest); } diff --git a/dbms/src/IO/Compression/ICompressionCodec.h b/dbms/src/IO/Compression/ICompressionCodec.h index 08b6585eef3..7542603539c 100644 --- a/dbms/src/IO/Compression/ICompressionCodec.h +++ b/dbms/src/IO/Compression/ICompressionCodec.h @@ -58,12 +58,6 @@ class ICompressionCodec : private boost::noncopyable /// Read method byte from compressed source static UInt8 readMethod(const char * source); - /// Return true if this codec actually compressing something. Otherwise it can be just transformation that helps compression (e.g. Delta). - virtual bool isCompression() const = 0; - - /// Is it a generic compression algorithm like lz4, zstd. Usually it does not make sense to apply generic compression more than single time. - virtual bool isGenericCompression() const = 0; - protected: /// Return size of compressed data without header virtual UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const { return uncompressed_size; } From 66ea1f62bb25238b2f2e83dfcda6156c48bc700b Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger <60744015+Lloyd-Pottiger@users.noreply.github.com> Date: Fri, 5 Jul 2024 17:55:24 +0800 Subject: [PATCH 20/27] Update dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp --- dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp b/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp index 3c9c496f743..8360a462955 100644 --- a/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp +++ b/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp @@ -71,7 +71,7 @@ bool CompressionCodecLightweight::IntegerCompressContext::needAnalyze() const return false; // if lz4 is used more than COUNT_THRESHOLD times and the compression ratio is better than lightweight codec, do not analyze anymore if (lz4_counter > COUNT_THRESHOLD - && lz4_uncompressed_size / lz4_compressed_size > lw_compressed_size / lw_uncompressed_size) + && lz4_uncompressed_size / lz4_compressed_size > lw_uncompressed_size / lw_compressed_size) return false; return true; } From f046e20be482044313888f642624c930c167b429 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Mon, 8 Jul 2024 11:43:21 +0800 Subject: [PATCH 21/27] Add comments --- dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp | 10 +++++----- dbms/src/IO/Compression/CompressionCodecFOR.cpp | 2 +- dbms/src/IO/Compression/CompressionInfo.h | 2 ++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp index f3e13490683..de461e05090 100644 --- a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp +++ b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp @@ -122,7 +122,7 @@ void CompressionCodecDeltaFOR::doDecompressData( char * dest, UInt32 uncompressed_size) const { - if unlikely (source_size < 2) + if (unlikely(source_size < 2)) throw Exception( ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress DeltaFor-encoded data. File has wrong header"); @@ -132,7 +132,7 @@ void CompressionCodecDeltaFOR::doDecompressData( UInt8 bytes_size = source[0]; auto data_type = magic_enum::enum_cast(bytes_size); - RUNTIME_CHECK(data_type.has_value()); + RUNTIME_CHECK(data_type.has_value(), bytes_size); UInt32 source_size_no_header = source_size - 1; switch (data_type.value()) @@ -162,7 +162,7 @@ void CompressionCodecDeltaFOR::ordinaryDecompress( char * dest, UInt32 uncompressed_size) { - if unlikely (source_size < 2) + if (unlikely(source_size < 2)) throw Exception( ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress DeltaFor-encoded data. File has wrong header"); @@ -172,9 +172,9 @@ void CompressionCodecDeltaFOR::ordinaryDecompress( UInt8 bytes_size = source[0]; auto data_type = magic_enum::enum_cast(bytes_size); - RUNTIME_CHECK(data_type.has_value()); + RUNTIME_CHECK(data_type.has_value(), bytes_size); - UInt32 source_size_no_header = source_size - 1; + const UInt32 source_size_no_header = source_size - 1; switch (data_type.value()) { case CompressionDataType::Int8: diff --git a/dbms/src/IO/Compression/CompressionCodecFOR.cpp b/dbms/src/IO/Compression/CompressionCodecFOR.cpp index 3c3b380946c..72f97112ecd 100644 --- a/dbms/src/IO/Compression/CompressionCodecFOR.cpp +++ b/dbms/src/IO/Compression/CompressionCodecFOR.cpp @@ -111,7 +111,7 @@ void CompressionCodecFOR::doDecompressData( char * dest, UInt32 uncompressed_size) const { - if unlikely (source_size < 2) + if (unlikely(source_size < 2)) throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress For-encoded data. File has wrong header"); if (unlikely(uncompressed_size == 0)) diff --git a/dbms/src/IO/Compression/CompressionInfo.h b/dbms/src/IO/Compression/CompressionInfo.h index f24e99741c6..2949f9ef1fe 100644 --- a/dbms/src/IO/Compression/CompressionInfo.h +++ b/dbms/src/IO/Compression/CompressionInfo.h @@ -68,10 +68,12 @@ enum class CompressionMethodByte : UInt8 enum class CompressionDataType : UInt8 { + // These enum values are used to represent the number of bytes of the type Int8 = 1, // Int8/UInt8 Int16 = 2, // Int16/UInt16 Int32 = 4, // Int32/UInt32 Int64 = 8, // Int64/UInt64 + // These enum values are not related to the number of bytes of the type Float32 = 9, Float64 = 10, String = 11, From b92c5eef4c5f4024108731435165a204797e0efa Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Mon, 8 Jul 2024 11:47:58 +0800 Subject: [PATCH 22/27] add comments Signed-off-by: Lloyd-Pottiger --- dbms/src/IO/Compression/EncodingUtil.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dbms/src/IO/Compression/EncodingUtil.cpp b/dbms/src/IO/Compression/EncodingUtil.cpp index 5ce62863150..45b50ce0e1a 100644 --- a/dbms/src/IO/Compression/EncodingUtil.cpp +++ b/dbms/src/IO/Compression/EncodingUtil.cpp @@ -155,7 +155,11 @@ void deltaDecoding(const char * source, UInt32 source_size, char * dest) } #if defined(__AVX2__) -// Note: using SIMD to rewrite compress does not improve performance. + +/** + * 1. According to microbenchmark, the performance of SIMD encoding is not better than the ordinary implementation. + * 2. The SIMD implementation of UInt16 and UInt8 is too complex, and the performance is not better than the ordinary implementation. + */ template <> void deltaDecoding(const char * __restrict__ raw_source, UInt32 raw_source_size, char * __restrict__ raw_dest) From bfdbe765eca120fc2a6e8043ab8bc58c95c5dc7c Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Mon, 8 Jul 2024 12:51:58 +0800 Subject: [PATCH 23/27] fix Signed-off-by: Lloyd-Pottiger --- dbms/src/IO/Compression/EncodingUtil.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/IO/Compression/EncodingUtil.h b/dbms/src/IO/Compression/EncodingUtil.h index b092a46111c..3bfd8d3b202 100644 --- a/dbms/src/IO/Compression/EncodingUtil.h +++ b/dbms/src/IO/Compression/EncodingUtil.h @@ -258,7 +258,7 @@ void ordinaryDeltaFORDecoding(const char * src, UInt32 source_size, char * dest, using TS = typename std::make_signed_t; // copy first value to dest memcpy(dest, src, sizeof(T)); - if (unlikely(source_size <= sizeof(T))) + if (unlikely(source_size == sizeof(T))) return; // decode deltas FORDecoding(src + sizeof(T), source_size - sizeof(T), dest + sizeof(T), dest_size - sizeof(T)); From 471481f0915d59725b4f40c0033d1e05951a7cba Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Mon, 8 Jul 2024 13:49:50 +0800 Subject: [PATCH 24/27] Add sanitizer checks --- .../CompressionCodecLightweight_Integer.cpp | 2 +- .../Compression/CompressionCodecRunLength.cpp | 2 +- dbms/src/IO/Compression/EncodingUtil.cpp | 48 ++++++++++++------- dbms/src/IO/Compression/EncodingUtil.h | 31 ++++++++---- 4 files changed, 53 insertions(+), 30 deletions(-) diff --git a/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp b/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp index 8360a462955..e4ba4dad00d 100644 --- a/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp +++ b/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp @@ -164,7 +164,7 @@ void CompressionCodecLightweight::IntegerCompressContext::analyze(std::span::max() : Compression::runLengthPairsSize(rle); + size_t rle_size = rle.empty() ? std::numeric_limits::max() : Compression::runLengthPairsByteSize(rle); if (needAnalyzeRunLength() && rle_size < delta_for_size && rle_size < for_size && rle_size < estimate_lz_size) { state = std::move(rle); diff --git a/dbms/src/IO/Compression/CompressionCodecRunLength.cpp b/dbms/src/IO/Compression/CompressionCodecRunLength.cpp index 08dbd908436..0364147a6d8 100644 --- a/dbms/src/IO/Compression/CompressionCodecRunLength.cpp +++ b/dbms/src/IO/Compression/CompressionCodecRunLength.cpp @@ -71,7 +71,7 @@ UInt32 CompressionCodecRunLength::compressDataForInteger(const char * source, UI ++rle_vec.back().second; } - if (DB::Compression::runLengthPairsSize(rle_vec) >= source_size) + if (DB::Compression::runLengthPairsByteSize(rle_vec) >= source_size) { // treat as string dest[0] = magic_enum::enum_integer(CompressionDataType::String); diff --git a/dbms/src/IO/Compression/EncodingUtil.cpp b/dbms/src/IO/Compression/EncodingUtil.cpp index 45b50ce0e1a..29717c7d32b 100644 --- a/dbms/src/IO/Compression/EncodingUtil.cpp +++ b/dbms/src/IO/Compression/EncodingUtil.cpp @@ -122,6 +122,8 @@ template void subtractFrameOfReference(UInt64 *, UInt64, UInt32); template UInt8 FOREncodingWidth(std::vector & values, T frame_of_reference) { + assert(!values.empty()); // caller must ensure input is not empty + if constexpr (std::is_signed_v) { // For signed types, after subtracting frame of reference, the range of values is not always [0, max_value - min_value]. @@ -228,55 +230,65 @@ void deltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 ordinaryDeltaFORDecoding(src, source_size, dest, dest_size); } +// For UInt8/UInt16, the default implement has better performance +template void deltaFORDecoding(const char *, UInt32, char *, UInt32); +template void deltaFORDecoding(const char *, UInt32, char *, UInt32); + template <> void deltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { - const auto deltas_count = dest_size / sizeof(UInt32) - 1; + static constexpr auto TYPE_BYTE_SIZE = sizeof(UInt32); + assert(source_size >= TYPE_BYTE_SIZE); + assert(dest_size >= TYPE_BYTE_SIZE); + + const auto deltas_count = dest_size / TYPE_BYTE_SIZE - 1; if (unlikely(deltas_count == 0)) { - memcpy(dest, src, sizeof(UInt32)); + memcpy(dest, src, TYPE_BYTE_SIZE); return; } auto round_size = BitpackingPrimitives::roundUpToAlgorithmGroupSize(deltas_count); // Reserve enough space for the temporary buffer. - const auto required_size = round_size * sizeof(UInt32) + sizeof(UInt32); + const auto required_size = round_size * TYPE_BYTE_SIZE + TYPE_BYTE_SIZE; char tmp_buffer[required_size]; memset(tmp_buffer, 0, required_size); // copy the first value to the temporary buffer - memcpy(tmp_buffer, src, sizeof(UInt32)); + memcpy(tmp_buffer, src, TYPE_BYTE_SIZE); FORDecoding( - src + sizeof(UInt32), - source_size - sizeof(UInt32), - tmp_buffer + sizeof(UInt32), - required_size - sizeof(UInt32)); + src + TYPE_BYTE_SIZE, + source_size - TYPE_BYTE_SIZE, + tmp_buffer + TYPE_BYTE_SIZE, + required_size - TYPE_BYTE_SIZE); deltaDecoding(reinterpret_cast(tmp_buffer), dest_size, dest); } template <> void deltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { - const auto deltas_count = dest_size / sizeof(UInt64) - 1; + static constexpr auto TYPE_BYTE_SIZE = sizeof(UInt64); + assert(source_size >= TYPE_BYTE_SIZE); + assert(dest_size >= TYPE_BYTE_SIZE); + + const auto deltas_count = dest_size / TYPE_BYTE_SIZE - 1; if (unlikely(deltas_count == 0)) { - memcpy(dest, src, sizeof(UInt64)); + memcpy(dest, src, TYPE_BYTE_SIZE); return; } const auto round_size = BitpackingPrimitives::roundUpToAlgorithmGroupSize(deltas_count); // Reserve enough space for the temporary buffer. - const auto required_size = round_size * sizeof(UInt64) + sizeof(UInt64); + const auto required_size = round_size * TYPE_BYTE_SIZE + TYPE_BYTE_SIZE; char tmp_buffer[required_size]; memset(tmp_buffer, 0, required_size); // copy the first value to the temporary buffer - memcpy(tmp_buffer, src, sizeof(UInt64)); + memcpy(tmp_buffer, src, TYPE_BYTE_SIZE); FORDecoding( - src + sizeof(UInt64), - source_size - sizeof(UInt64), - tmp_buffer + sizeof(UInt64), - required_size - sizeof(UInt64)); + src + TYPE_BYTE_SIZE, + source_size - TYPE_BYTE_SIZE, + tmp_buffer + TYPE_BYTE_SIZE, + required_size - TYPE_BYTE_SIZE); deltaDecoding(reinterpret_cast(tmp_buffer), dest_size, dest); } -template void deltaFORDecoding(const char *, UInt32, char *, UInt32); -template void deltaFORDecoding(const char *, UInt32, char *, UInt32); } // namespace DB::Compression diff --git a/dbms/src/IO/Compression/EncodingUtil.h b/dbms/src/IO/Compression/EncodingUtil.h index 3bfd8d3b202..4781c2d6c74 100644 --- a/dbms/src/IO/Compression/EncodingUtil.h +++ b/dbms/src/IO/Compression/EncodingUtil.h @@ -90,6 +90,7 @@ void constantDeltaDecoding(const char * src, UInt32 source_size, char * dest, UI /// Run-length encoding +// template using RunLengthPair = std::pair; template @@ -98,7 +99,7 @@ template static constexpr size_t RunLengthPairLength = sizeof(T) + sizeof(UInt8); template -size_t runLengthPairsSize(const RunLengthPairs & rle) +size_t runLengthPairsByteSize(const RunLengthPairs & rle) { return rle.size() * RunLengthPairLength; } @@ -134,7 +135,11 @@ void runLengthDecoding(const char * src, UInt32 source_size, char * dest, UInt32 auto count = unalignedLoad(src); src += sizeof(UInt8); if (unlikely(dest + count * sizeof(T) > dest_end)) - throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot use RunLength decoding, data is too large"); + throw Exception( + ErrorCodes::CANNOT_DECOMPRESS, + "Cannot use RunLength decoding, data is too large, count={} elem_byte={}", + count, + sizeof(T)); if constexpr (std::is_same_v || std::is_same_v) { memset(dest, value, count); @@ -162,7 +167,8 @@ UInt8 FOREncodingWidth(std::vector & values, T frame_of_reference); template size_t FOREncoding(std::vector & values, T frame_of_reference, UInt8 width, char * dest) { - assert(!values.empty()); + assert(!values.empty()); // caller must ensure input is not empty + if constexpr (!skip_subtract_frame_of_reference) subtractFrameOfReference(values.data(), frame_of_reference, values.size()); // store frame of reference @@ -186,25 +192,26 @@ void applyFrameOfReference(T * dst, T frame_of_reference, UInt32 count); template void FORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { - UInt8 bytes_size = sizeof(T); - if unlikely (dest_size % bytes_size != 0) + static constexpr UInt8 BYTES_SIZE = sizeof(T); + if unlikely (dest_size % BYTES_SIZE != 0) throw Exception( ErrorCodes::CANNOT_DECOMPRESS, "uncompressed size {} is not aligned to {}", dest_size, - bytes_size); - const auto count = dest_size / sizeof(T); + BYTES_SIZE); + + const auto count = dest_size / BYTES_SIZE; T frame_of_reference = unalignedLoad(src); - src += sizeof(T); + src += BYTES_SIZE; auto width = unalignedLoad(src); src += sizeof(UInt8); - const auto required_size = source_size - sizeof(T) - sizeof(UInt8); + const auto required_size = source_size - BYTES_SIZE - sizeof(UInt8); RUNTIME_CHECK(BitpackingPrimitives::getRequiredSize(count, width) == required_size); auto round_size = BitpackingPrimitives::roundUpToAlgorithmGroupSize(count); if (round_size != count) { // Reserve enough space for the temporary buffer. - unsigned char tmp_buffer[round_size * sizeof(T)]; + unsigned char tmp_buffer[round_size * BYTES_SIZE]; BitpackingPrimitives::unPackBuffer(tmp_buffer, reinterpret_cast(src), count, width); applyFrameOfReference(reinterpret_cast(tmp_buffer), frame_of_reference, count); memcpy(dest, tmp_buffer, dest_size); @@ -255,6 +262,10 @@ void deltaDecoding(const char * source, UInt32 source_size, char * dest); template void ordinaryDeltaFORDecoding(const char * src, UInt32 source_size, char * dest, UInt32 dest_size) { + // caller should ensure these size + assert(source_size >= sizeof(T)); + assert(dest_size >= sizeof(T)); + using TS = typename std::make_signed_t; // copy first value to dest memcpy(dest, src, sizeof(T)); From c6ac15338637e1adce212b2a1d8b301f3378ee50 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Mon, 8 Jul 2024 13:54:54 +0800 Subject: [PATCH 25/27] Use UInt32 for looping --- dbms/src/IO/Compression/EncodingUtil.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/IO/Compression/EncodingUtil.h b/dbms/src/IO/Compression/EncodingUtil.h index 4781c2d6c74..24cad02ab5c 100644 --- a/dbms/src/IO/Compression/EncodingUtil.h +++ b/dbms/src/IO/Compression/EncodingUtil.h @@ -147,7 +147,7 @@ void runLengthDecoding(const char * src, UInt32 source_size, char * dest, UInt32 } else { - for (UInt8 j = 0; j < count; ++j) + for (UInt32 j = 0; j < count; ++j) { unalignedStore(dest, value); dest += sizeof(T); From d3d14d60b9bb67de75e024ddc7e0fd35642b2025 Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger Date: Mon, 8 Jul 2024 15:30:39 +0800 Subject: [PATCH 26/27] assert source_size > 0 Signed-off-by: Lloyd-Pottiger --- dbms/src/IO/Compression/ICompressionCodec.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dbms/src/IO/Compression/ICompressionCodec.cpp b/dbms/src/IO/Compression/ICompressionCodec.cpp index 1e0d2cb94e2..393669045ab 100644 --- a/dbms/src/IO/Compression/ICompressionCodec.cpp +++ b/dbms/src/IO/Compression/ICompressionCodec.cpp @@ -30,6 +30,7 @@ extern const int CORRUPTED_DATA; UInt32 ICompressionCodec::compress(const char * source, UInt32 source_size, char * dest) const { assert(source != nullptr && dest != nullptr); + assert(source_size > 0); dest[0] = getMethodByte(); UInt8 header_size = getHeaderSize(); From 7b6f51d0b90e053cb92d6c5fd29825e36b5a301e Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Mon, 8 Jul 2024 15:50:46 +0800 Subject: [PATCH 27/27] Add comments --- dbms/src/IO/Compression/CompressionCodecLightweight.h | 3 +++ .../IO/Compression/CompressionCodecLightweight_Integer.cpp | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/dbms/src/IO/Compression/CompressionCodecLightweight.h b/dbms/src/IO/Compression/CompressionCodecLightweight.h index f1f952fe68f..bc092d06e27 100644 --- a/dbms/src/IO/Compression/CompressionCodecLightweight.h +++ b/dbms/src/IO/Compression/CompressionCodecLightweight.h @@ -27,6 +27,9 @@ namespace DB * For integer data, it supports constant, constant delta, run-length, frame of reference, delta frame of reference, and LZ4. * For non-integer data, it supports LZ4. * The codec selects the best mode for each block of data. + * + * Note that this codec instance contains `ctx` for choosing the best compression + * mode for each block. Do NOT reuse the same instance for encoding data among multi-threads. */ class CompressionCodecLightweight : public ICompressionCodec { diff --git a/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp b/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp index e4ba4dad00d..6ce2b51bbcc 100644 --- a/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp +++ b/dbms/src/IO/Compression/CompressionCodecLightweight_Integer.cpp @@ -121,8 +121,9 @@ void CompressionCodecLightweight::IntegerCompressContext::analyze(std::span 1); deltas.reserve(values.size() - 1); for (size_t i = 1; i < values.size(); ++i) {