Skip to content

Commit

Permalink
avoid analyze delta every time
Browse files Browse the repository at this point in the history
Signed-off-by: Lloyd-Pottiger <yan1579196623@gmail.com>
  • Loading branch information
Lloyd-Pottiger committed May 10, 2024
1 parent 7da6874 commit d88a3f7
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 26 deletions.
69 changes: 43 additions & 26 deletions dbms/src/IO/Compression/CompressionCodecIntegerLightweight.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ UInt8 CompressionCodecIntegerLightweight::getMethodByte() const
/// Returns an upper bound on the number of bytes that compressing
/// `uncompressed_size` bytes may produce, including the 2-byte header.
UInt32 CompressionCodecIntegerLightweight::getMaxCompressedDataSize(UInt32 uncompressed_size) const
{
// 1 byte for bytes_size, 1 byte for mode, and the rest for compressed data
// NOTE(review): two return statements appear back to back here; as written only
// the first one can execute. This text looks like a rendered diff, so presumably
// the first return is the pre-change line and the second is its replacement --
// confirm against the committed file.
return 1 + 1 + uncompressed_size;
// Header bytes plus the bound computed by LZ4_COMPRESSBOUND (presumably the
// worst-case LZ4 output size macro from lz4.h -- TODO confirm).
return 1 + 1 + LZ4_COMPRESSBOUND(uncompressed_size);
}

template <typename T>
Expand Down Expand Up @@ -191,6 +191,10 @@ void CompressionCodecIntegerLightweight::CompressContext::update(size_t uncompre
lw_compressed_size += compressed_size;
++lw_counter;
}
if (mode == Mode::CONSTANT_DELTA)
++constant_delta_counter;
if (mode == Mode::DELTA_FOR)
++delta_for_counter;
}

bool CompressionCodecIntegerLightweight::CompressContext::needAnalyze() const
Expand All @@ -204,18 +208,23 @@ bool CompressionCodecIntegerLightweight::CompressContext::needAnalyze() const
return true;
}

/// Decides whether analyze() should run the delta-based checks
/// (CONSTANT_DELTA and DELTA_FOR) for the current block.
///
/// Returns true when `lw_counter` is at most 5, or when either
/// `constant_delta_counter` or `delta_for_counter` is non-zero; otherwise false.
bool CompressionCodecIntegerLightweight::CompressContext::needAnalyzeDelta() const
{
    // While lw_counter has not exceeded 5, always analyze deltas.
    if (lw_counter <= 5)
        return true;

    // Afterwards, analyze deltas only if a delta-based mode has been counted
    // at least once (both counters are bumped in update() for those modes).
    const bool delta_mode_seen = (constant_delta_counter != 0) || (delta_for_counter != 0);
    return delta_mode_seen;
}

template <typename T>
void CompressionCodecIntegerLightweight::CompressContext::analyze(std::vector<T> & values, State<T> & state)
{
if (!needAnalyze())
return;

if (values.empty())
{
mode = Mode::Invalid;
return;
}

if (!needAnalyze())
return;

// Check CONSTANT
std::vector<std::pair<T, UInt8>> rle;
rle.reserve(values.size());
Expand All @@ -236,46 +245,54 @@ void CompressionCodecIntegerLightweight::CompressContext::analyze(std::vector<T>
return;
}

// Check CONSTANT_DELTA
using TS = std::make_signed_t<T>;
std::vector<TS> deltas;
deltas.reserve(values.size());
deltas.push_back(values[0]);
for (size_t i = 1; i < values.size(); ++i)
UInt8 delta_for_width = sizeof(T) * 8;
size_t delta_for_size = std::numeric_limits<size_t>::max();
TS min_delta = std::numeric_limits<TS>::min();
if (needAnalyzeDelta())
{
deltas.push_back(values[i] - values[i - 1]);
}
TS min_delta = *std::min_element(deltas.cbegin(), deltas.cend());
TS max_delta = *std::max_element(deltas.cbegin(), deltas.cend());
if (min_delta == max_delta)
{
state = static_cast<T>(min_delta);
mode = Mode::CONSTANT_DELTA;
return;
// Check CONSTANT_DELTA
deltas.reserve(values.size());
deltas.push_back(values[0]);
for (size_t i = 1; i < values.size(); ++i)
{
deltas.push_back(values[i] - values[i - 1]);
}
min_delta = *std::min_element(deltas.cbegin(), deltas.cend());
if (min_delta == *std::max_element(deltas.cbegin(), deltas.cend()))
{
state = static_cast<T>(min_delta);
mode = Mode::CONSTANT_DELTA;
return;
}

delta_for_width = Compression::FOREncodingWidth(deltas, min_delta);
// additional T bytes for min_delta, and 1 byte for width
delta_for_size
= BitpackingPrimitives::getRequiredSize(deltas.size(), delta_for_width) + sizeof(T) + sizeof(UInt8);
}

UInt8 delta_for_width = Compression::FOREncodingWidth(deltas, min_delta);
// additional T bytes for min_delta, and 1 byte for width
size_t delta_for_size
= BitpackingPrimitives::getRequiredSize(deltas.size(), delta_for_width) + sizeof(T) + sizeof(UInt8);
UInt8 for_width = BitpackingPrimitives::minimumBitWidth<T>(max_value - min_value);
// additional T bytes for min_value, and 1 byte for width
size_t for_size = BitpackingPrimitives::getRequiredSize(values.size(), for_width) + sizeof(T) + sizeof(UInt8);
size_t origin_size = values.size() * sizeof(T);
// Assume that the compression ratio of LZ4 is 3.0
// The official document says that the compression ratio of LZ4 is 2.1, https://github.com/lz4/lz4
size_t estimate_lz_size = values.size() * sizeof(T) / 3;
size_t rle_size = Compression::RLEPairsSize(rle);
if (rle_size < delta_for_size && rle_size < for_size && rle_size < origin_size)
if (rle_size < delta_for_size && rle_size < for_size && rle_size < estimate_lz_size)
{
state = std::move(rle);
mode = Mode::RLE;
}
else if (for_size < delta_for_size && for_size < origin_size)
else if (for_size < delta_for_size && for_size < estimate_lz_size)
{
state = FORState<T>{min_value, for_width};
mode = Mode::FOR;
}
else if (delta_for_size < origin_size)
else if (delta_for_size < estimate_lz_size)
{
state = DeltaFORState<T>{deltas, min_delta, delta_for_width};
state = DeltaFORState<T>{std::move(deltas), min_delta, delta_for_width};
mode = Mode::DELTA_FOR;
}
else
Expand Down
4 changes: 4 additions & 0 deletions dbms/src/IO/Compression/CompressionCodecIntegerLightweight.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ class CompressionCodecIntegerLightweight : public ICompressionCodec

bool needAnalyze() const;

bool needAnalyzeDelta() const;

template <typename T>
void analyze(std::vector<T> & values, State<T> & state);

Expand All @@ -96,6 +98,8 @@ class CompressionCodecIntegerLightweight : public ICompressionCodec
size_t lz4_uncompressed_size = 0;
size_t lz4_compressed_size = 0;
size_t lz4_counter = 0;
size_t constant_delta_counter = 0;
size_t delta_for_counter = 0;
};

template <typename T>
Expand Down

0 comments on commit d88a3f7

Please sign in to comment.