pingcap · Lloyd-Pottiger · Jul 8, 2024 · Jul 9, 2024 · Jul 16, 2024 · Jul 17, 2024
diff --git a/.clangd b/.clangd
@@ -1,2 +1,4 @@
 CompileFlags:
- Add: -ferror-limit=0
+ Add:
+ - -ferror-limit=0
+ - -Wno-vla-cxx-extension
diff --git a/dbms/src/Common/TiFlashMetrics.h b/dbms/src/Common/TiFlashMetrics.h
@@ -867,7 +867,24 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva
  M(tiflash_read_thread_internal_us, \
  "Durations of read thread internal components", \
  Histogram, \
- F(type_block_queue_pop_latency, {{"type", "block_queue_pop_latency"}}, ExpBuckets{1, 2, 20}))
+ F(type_block_queue_pop_latency, {{"type", "block_queue_pop_latency"}}, ExpBuckets{1, 2, 20})) \
+ M(tiflash_storage_pack_compression_algorithm_count, \
+ "The count of the compression algorithm used by each data part", \
+ Counter, \
+ F(type_constant, {"type", "constant"}), \
+ F(type_constant_delta, {"type", "constant_delta"}), \
+ F(type_runlength, {"type", "runlength"}), \
+ F(type_for, {"type", "for"}), \
+ F(type_delta_for, {"type", "delta_for"}), \
+ F(type_lz4, {"type", "lz4"}), \
+ F(type_delta_lz4, {"type", "delta_lz4"})) \
+ M(tiflash_storage_pack_compression_bytes, \
+ "The uncompression/compression bytes of lz4 and lightweight", \
+ Counter, \
+ F(type_lz4_compressed_bytes, {"type", "lz4_compressed_bytes"}), \
+ F(type_lz4_uncompressed_bytes, {"type", "lz4_uncompressed_bytes"}), \
+ F(type_lightweight_compressed_bytes, {"type", "lightweight_compressed_bytes"}), \
+ F(type_lightweight_uncompressed_bytes, {"type", "lightweight_uncompressed_bytes"}))
 
 
 /// Buckets with boundaries [start * base^0, start * base^1, ..., start * base^(size-1)]

diff --git a/dbms/src/DataStreams/MergeSortingBlockInputStream.cpp b/dbms/src/DataStreams/MergeSortingBlockInputStream.cpp
@@ -20,7 +20,6 @@
 #include <DataStreams/SortHelper.h>
 #include <DataStreams/copyData.h>
 #include <IO/Buffer/WriteBufferFromFile.h>
-#include <IO/Compression/CompressedWriteBuffer.h>
 #include <common/logger_useful.h>
 
 namespace DB

diff --git a/dbms/src/IO/Compression/CompressionCodecDeflateQpl.h b/dbms/src/IO/Compression/CompressionCodecDeflateQpl.h
@@ -103,6 +103,6 @@ class CompressionCodecDeflateQpl final : public ICompressionCodec
  UInt8 getMethodByte() const override;
  bool isCompression() const override { return true; }
 
 protected:
  UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override;
  void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size)

diff --git a/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp b/dbms/src/IO/Compression/CompressionCodecDeltaFOR.cpp
@@ -17,7 +17,6 @@
 #include <IO/Compression/CompressionCodecDeltaFOR.h>
 #include <IO/Compression/CompressionCodecFOR.h>
 #include <IO/Compression/CompressionInfo.h>
-#include <IO/Compression/CompressionSettings.h>
 #include <IO/Compression/EncodingUtil.h>
 #include <common/likely.h>
 
@@ -60,15 +59,14 @@ UInt32 compressData(const char * source, UInt32 source_size, char * dest)
  if unlikely (source_size % bytes_size != 0)
  throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "source size {} is not aligned to {}", source_size, bytes_size);
  const auto count = source_size / bytes_size;
- DB::Compression::deltaEncoding<T>(reinterpret_cast<const T *>(source), count, reinterpret_cast<T *>(dest));
+ using TS = typename std::make_signed<T>::type;
+ // View source as signed integers so that the deltas are smaller.
+ // Encode before the single-element early return below so that dest is populated on that path too.
+ DB::Compression::deltaEncoding<TS>(reinterpret_cast<const TS *>(source), count, reinterpret_cast<TS *>(dest));
  if (unlikely(count == 1))
  return bytes_size;
- // Cast deltas to signed type to better compress negative values.
- // For example, if we have a sequence of UInt8 values [3, 2, 1, 0], the deltas will be [3, -1, -1, -1]
- // If we compress them as UInt8, we will get [3, 255, 255, 255], which is not optimal.
- using TS = typename std::make_signed<T>::type;
- auto for_size = DB::CompressionCodecFOR::compressData<TS>(
- reinterpret_cast<TS *>(dest + bytes_size),
+ auto for_size = DB::CompressionCodecFOR::compressData<T>(
+ reinterpret_cast<T *>(dest + bytes_size),
  source_size - bytes_size,
  dest + bytes_size);
  return bytes_size + for_size;

diff --git a/dbms/src/IO/Compression/CompressionCodecFOR.cpp b/dbms/src/IO/Compression/CompressionCodecFOR.cpp
@@ -16,7 +16,6 @@
 #include <Common/Exception.h>
 #include <IO/Compression/CompressionCodecFOR.h>
 #include <IO/Compression/CompressionInfo.h>
-#include <IO/Compression/CompressionSettings.h>
 #include <IO/Compression/EncodingUtil.h>
 #include <common/likely.h>
 
@@ -60,9 +59,11 @@ UInt32 CompressionCodecFOR::compressData(const T * source, UInt32 source_size, c
  if unlikely (count == 0)
  throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress empty data");
  std::vector<T> values(source, source + count);
- T frame_of_reference = *std::min_element(values.cbegin(), values.cend());
- UInt8 width = DB::Compression::FOREncodingWidth(values, frame_of_reference);
- return DB::Compression::FOREncoding<T, std::is_signed_v<T>>(values, frame_of_reference, width, dest);
+ auto minmax = std::minmax_element(values.cbegin(), values.cend());
+ T frame_of_reference = *minmax.first;
+ T max_value = *minmax.second;
+ UInt8 width = BitpackingPrimitives::minimumBitWidth<T>(max_value - frame_of_reference);
+ return DB::Compression::FOREncoding(values.data(), values.size(), frame_of_reference, width, dest);
 }
 
 UInt32 CompressionCodecFOR::doCompressData(const char * source, UInt32 source_size, char * dest) const

diff --git a/dbms/src/IO/Compression/CompressionCodecFactory.cpp b/dbms/src/IO/Compression/CompressionCodecFactory.cpp
@@ -77,7 +77,6 @@ template CompressionCodecPtr CompressionCodecFactory::getStaticCodec<Compression
 template CompressionCodecPtr CompressionCodecFactory::getStaticCodec<CompressionCodecRunLength>(
  const CompressionSetting & setting);
 
-
 template <>
 CompressionCodecPtr CompressionCodecFactory::getStaticCodec<CompressionCodecLZ4>(const CompressionSetting & setting)
 {
@@ -152,7 +151,6 @@ CompressionCodecPtr CompressionCodecFactory::getStaticCodec<CompressionCodecDefl
 }
 #endif
 
-
 template <bool IS_COMPRESS>
 CompressionCodecPtr CompressionCodecFactory::create(const CompressionSetting & setting)
 {
@@ -190,7 +188,7 @@ CompressionCodecPtr CompressionCodecFactory::create(const CompressionSetting & s
  switch (setting.method_byte)
  {
  case CompressionMethodByte::Lightweight:
- return std::make_unique<CompressionCodecLightweight>(setting.data_type);
+ return std::make_unique<CompressionCodecLightweight>(setting.data_type, setting.level);
  case CompressionMethodByte::DeltaFOR:
  return getStaticCodec<CompressionCodecDeltaFOR>(setting);
  case CompressionMethodByte::RunLength:

diff --git a/dbms/src/IO/Compression/CompressionCodecLZ4.cpp b/dbms/src/IO/Compression/CompressionCodecLZ4.cpp
@@ -72,10 +72,4 @@ CompressionCodecLZ4HC::CompressionCodecLZ4HC(int level_)
  : CompressionCodecLZ4(level_)
 {}
 
-
-CompressionCodecPtr getCompressionCodecLZ4(int level)
-{
- return std::make_unique<CompressionCodecLZ4HC>(level);
-}
-
 } // namespace DB
diff --git a/dbms/src/IO/Compression/CompressionCodecLZ4.h b/dbms/src/IO/Compression/CompressionCodecLZ4.h
@@ -19,9 +19,14 @@
 namespace DB
 {
 
+class CompressionCodecFactory;
+
 class CompressionCodecLZ4 : public ICompressionCodec
 {
 public:
+ // Estimated LZ4 compression ratio for integer data. For reference, the official document reports a ratio of 2.1, https://github.com/lz4/lz4
+ static constexpr size_t ESTIMATE_INTEGER_COMPRESSION_RATIO = 4;
+
  explicit CompressionCodecLZ4(int level_);
 
  UInt8 getMethodByte() const override;
@@ -39,6 +44,7 @@ class CompressionCodecLZ4 : public ICompressionCodec
 
 protected:
  const int level;
+ friend class CompressionCodecFactory;
 };
 
 

diff --git a/dbms/src/IO/Compression/CompressionCodecLightweight.cpp b/dbms/src/IO/Compression/CompressionCodecLightweight.cpp
@@ -23,16 +23,16 @@
 namespace DB
 {
 
-// TODO: metrics
 
 namespace ErrorCodes
 {
 extern const int CANNOT_COMPRESS;
 extern const int CANNOT_DECOMPRESS;
 } // namespace ErrorCodes
 
-CompressionCodecLightweight::CompressionCodecLightweight(CompressionDataType data_type_)
- : data_type(data_type_)
+CompressionCodecLightweight::CompressionCodecLightweight(CompressionDataType data_type_, int level_)
+ : ctx(level_)
+ , data_type(data_type_)
 {}
 
 UInt8 CompressionCodecLightweight::getMethodByte() const
@@ -46,12 +46,6 @@ UInt32 CompressionCodecLightweight::getMaxCompressedDataSize(UInt32 uncompressed
  return 1 + 1 + LZ4_COMPRESSBOUND(uncompressed_size);
 }
 
-CompressionCodecLightweight::~CompressionCodecLightweight()
-{
- if (ctx.isCompression())
- LOG_INFO(Logger::get(), "lightweight codec: {}", ctx.toDebugString());
-}
-
 UInt32 CompressionCodecLightweight::doCompressData(const char * source, UInt32 source_size, char * dest) const
 {
  dest[0] = magic_enum::enum_integer(data_type);

diff --git a/dbms/src/IO/Compression/CompressionCodecLightweight.h b/dbms/src/IO/Compression/CompressionCodecLightweight.h
@@ -34,13 +34,13 @@ namespace DB
 class CompressionCodecLightweight : public ICompressionCodec
 {
 public:
- explicit CompressionCodecLightweight(CompressionDataType data_type_);
+ explicit CompressionCodecLightweight(CompressionDataType data_type_, int level_);
 
  UInt8 getMethodByte() const override;
 
- bool isCompression() const override { return true; }
+ ~CompressionCodecLightweight() override = default;
 
- ~CompressionCodecLightweight() override;
+ bool isCompression() const override { return true; }
 
 protected:
  UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override;
@@ -55,86 +55,88 @@ class CompressionCodecLightweight : public ICompressionCodec
  enum class IntegerMode : UInt8
  {
  Invalid = 0,
- CONSTANT = 1, // all values are the same
- CONSTANT_DELTA = 2, // the difference between two adjacent values is the same
- RunLength = 3, // run-length encoding
+ Constant = 1, // all values are the same
+ ConstantDelta = 2, // the difference between two adjacent values is the same
+ RunLength = 3, // the same value appears multiple times
  FOR = 4, // Frame of Reference encoding
- DELTA_FOR = 5, // delta encoding and then FOR encoding
+ DeltaFOR = 5, // delta encoding and then FOR encoding
  LZ4 = 6, // the above modes are not suitable, use LZ4 instead
+ DeltaLZ4 = 7, // delta encoding and then LZ4
  };
 
  // Constant or ConstantDelta
- template <typename T>
+ template <std::integral T>
  using ConstantState = T;
 
- template <typename T>
- using RunLengthState = std::vector<std::pair<T, UInt8>>;
-
- template <typename T>
+ template <std::integral T>
  struct FORState
  {
  std::vector<T> values;
  T min_value;
  UInt8 bit_width;
  };
 
- template <typename T>
+ template <std::integral T>
  struct DeltaFORState
  {
  using TS = typename std::make_signed_t<T>;
- std::vector<TS> deltas;
+ std::vector<T> deltas;
  TS min_delta_value;
  UInt8 bit_width;
  };
 
+ template <std::integral T>
+ struct DeltaLZ4State
+ {
+ std::vector<T> deltas;
+ };
+
  // State is a union of different states for different modes
- template <typename T>
- using IntegerState = std::variant<ConstantState<T>, RunLengthState<T>, FORState<T>, DeltaFORState<T>>;
+ template <std::integral T>
+ using IntegerState = std::variant<ConstantState<T>, FORState<T>, DeltaFORState<T>, DeltaLZ4State<T>>;
 
  class IntegerCompressContext
  {
  public:
- IntegerCompressContext() = default;
+ explicit IntegerCompressContext(int round_count_)
+ : round_count(round_count_)
+ {}
 
- template <typename T>
+ template <std::integral T>
  void analyze(std::span<const T> & values, IntegerState<T> & state);
 
  void update(size_t uncompressed_size, size_t compressed_size);
 
- String toDebugString() const;
- bool isCompression() const { return lz4_counter > 0 || lw_counter > 0; }
-
  IntegerMode mode = IntegerMode::LZ4;
 
  private:
  bool needAnalyze() const;
+
+ template <std::integral T>
  bool needAnalyzeDelta() const;
+
+ template <std::integral T>
+ static constexpr bool needAnalyzeFOR();
+
  bool needAnalyzeRunLength() const;
 
+ void resetIfNeed();
+
  private:
- // The threshold for the number of blocks to decide whether need to analyze.
- // For example:
- // If lz4 is used more than COUNT_THRESHOLD times and the compression ratio is better than lightweight codec, do not analyze anymore.
- static constexpr size_t COUNT_THRESHOLD = 5;
- // Assume that the compression ratio of LZ4 is 3.0
- // The official document says that the compression ratio of LZ4 is 2.1, https://github.com/lz4/lz4
- static constexpr size_t ESRTIMATE_LZ4_COMPRESSION_RATIO = 3;
-
- size_t lw_uncompressed_size = 0;
- size_t lw_compressed_size = 0;
- size_t lw_counter = 0;
- size_t lz4_uncompressed_size = 0;
- size_t lz4_compressed_size = 0;
- size_t lz4_counter = 0;
- size_t constant_delta_counter = 0;
- size_t delta_for_counter = 0;
- size_t rle_counter = 0;
+ // Blocks are grouped into rounds of round_count blocks; whether to analyze the mode is decided per round.
+ const int round_count;
+ int compress_count = 0;
+ bool used_lz4 = false;
+ bool used_constant_delta = false;
+ bool used_delta_for = false;
+ bool used_rle = false;
+ bool used_delta_lz4 = false;
  };
 
- template <typename T>
+ template <std::integral T>
  size_t compressDataForInteger(const char * source, UInt32 source_size, char * dest) const;
 
- template <typename T>
+ template <std::integral T>
  void decompressDataForInteger(const char * source, UInt32 source_size, char * dest, UInt32 output_size) const;
 
  /// Non-integer data