Skip to content

Commit

Permalink
Optimize Aggregator/Join/Set keys (#6135)
Browse files Browse the repository at this point in the history
ref #5294
  • Loading branch information
solotzg authored Oct 29, 2022
1 parent 0db359a commit 1a50e00
Show file tree
Hide file tree
Showing 17 changed files with 1,283 additions and 370 deletions.
59 changes: 31 additions & 28 deletions dbms/src/Columns/ColumnString.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#include <Common/PODArray.h>
#include <Common/SipHash.h>
#include <Common/memcpySmall.h>
#include <string.h>
#include <common/memcpy.h>


class ICollator;
Expand Down Expand Up @@ -119,13 +119,7 @@ class ColumnString final : public COWPtrHelper<IColumn, ColumnString>
void insert(const Field & x) override
{
const auto & s = DB::get<const String &>(x);
const size_t old_size = chars.size();
const size_t size_to_append = s.size() + 1;
const size_t new_size = old_size + size_to_append;

chars.resize(new_size);
memcpy(&chars[old_size], s.c_str(), size_to_append);
offsets.push_back(new_size);
insertData(s.data(), s.size());
}

#if !__clang__
Expand Down Expand Up @@ -169,17 +163,25 @@ class ColumnString final : public COWPtrHelper<IColumn, ColumnString>
}
}

void insertData(const char * pos, size_t length) override
template <bool add_terminating_zero>
ALWAYS_INLINE inline void insertDataImpl(const char * pos, size_t length)
{
const size_t old_size = chars.size();
const size_t new_size = old_size + length + 1;
const size_t new_size = old_size + length + (add_terminating_zero ? 1 : 0);

chars.resize(new_size);
memcpy(&chars[old_size], pos, length);
chars[old_size + length] = 0;
inline_memcpy(&chars[old_size], pos, length);

if constexpr (add_terminating_zero)
chars[old_size + length] = 0;
offsets.push_back(new_size);
}

/// Appends the `length` bytes starting at `pos` as a new row.
/// Delegates to `insertDataImpl<true>`, i.e. the variant that writes a
/// terminating zero after the copied bytes.
void insertData(const char * pos, size_t length) override
{
    insertDataImpl<true>(pos, length);
}

bool decodeTiDBRowV2Datum(size_t cursor, const String & raw_value, size_t length, bool /* force_decode */) override
{
insertData(raw_value.c_str() + cursor, length);
Expand All @@ -188,12 +190,7 @@ class ColumnString final : public COWPtrHelper<IColumn, ColumnString>

void insertDataWithTerminatingZero(const char * pos, size_t length) override
{
const size_t old_size = chars.size();
const size_t new_size = old_size + length;

chars.resize(new_size);
memcpy(&chars[old_size], pos, length);
offsets.push_back(new_size);
return insertDataImpl<false>(pos, length);
}

void popBack(size_t n) override
Expand All @@ -220,24 +217,30 @@ class ColumnString final : public COWPtrHelper<IColumn, ColumnString>
}
res.size = sizeof(string_size) + string_size;
char * pos = arena.allocContinue(res.size, begin);
memcpy(pos, &string_size, sizeof(string_size));
memcpy(pos + sizeof(string_size), src, string_size);
std::memcpy(pos, &string_size, sizeof(string_size));
inline_memcpy(pos + sizeof(string_size), src, string_size);
res.data = pos;
return res;
}

const char * deserializeAndInsertFromArena(const char * pos, const TiDB::TiDBCollatorPtr &) override
inline const char * deserializeAndInsertFromArena(const char * pos, const TiDB::TiDBCollatorPtr & collator) override
{
const size_t string_size = *reinterpret_cast<const size_t *>(pos);
pos += sizeof(string_size);

const size_t old_size = chars.size();
const size_t new_size = old_size + string_size;
chars.resize(new_size);
memcpy(&chars[old_size], pos, string_size);

offsets.push_back(new_size);
return pos + string_size;
if (likely(collator))
{
// https://github.com/pingcap/tiflash/pull/6135
// - Generate empty string column
// - Make size of `offsets` as previous way for func `ColumnString::size()`
offsets.push_back(0);
return pos + string_size;
}
else
{
insertDataWithTerminatingZero(pos, string_size);
return pos + string_size;
}
}

void updateHashWithValue(size_t n, SipHash & hash, const TiDB::TiDBCollatorPtr & collator, String & sort_key_container) const override
Expand Down
7 changes: 0 additions & 7 deletions dbms/src/Columns/ColumnVector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,6 @@ StringRef ColumnVector<T>::serializeValueIntoArena(size_t n, Arena & arena, char
return StringRef(pos, sizeof(T));
}

/// Reads one value of type `T` from the bytes at `pos`, appends it to `data`,
/// and returns the position immediately after the consumed bytes.
/// The collator argument is not used.
template <typename T>
const char * ColumnVector<T>::deserializeAndInsertFromArena(const char * pos, const TiDB::TiDBCollatorPtr &)
{
    // NOTE(review): the value is read through a `const T *` view of `pos`;
    // this presumes `pos` is suitably aligned for `T` — confirm against callers.
    const auto * value_ptr = reinterpret_cast<const T *>(pos);
    data.push_back(*value_ptr);
    return pos + sizeof(T);
}

template <typename T>
void ColumnVector<T>::updateHashWithValue(size_t n, SipHash & hash, const TiDB::TiDBCollatorPtr &, String &) const
{
Expand Down
6 changes: 5 additions & 1 deletion dbms/src/Columns/ColumnVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,11 @@ class ColumnVector final : public COWPtrHelper<ColumnVectorHelper, ColumnVector<

StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin, const TiDB::TiDBCollatorPtr &, String &) const override;

const char * deserializeAndInsertFromArena(const char * pos, const TiDB::TiDBCollatorPtr &) override;
/// Reads one value of type `T` from the bytes at `pos`, appends it to `data`,
/// and returns the position immediately after the consumed bytes.
/// The collator argument is not used.
inline const char * deserializeAndInsertFromArena(const char * pos, const TiDB::TiDBCollatorPtr &) override
{
    // NOTE(review): the value is read through a `const T *` view of `pos`;
    // this presumes `pos` is suitably aligned for `T` — confirm against callers.
    const T * const value_ptr = reinterpret_cast<const T *>(pos);
    data.push_back(*value_ptr);
    return pos + sizeof(T);
}

void updateHashWithValue(size_t n, SipHash & hash, const TiDB::TiDBCollatorPtr &, String &) const override;
void updateHashWithValues(IColumn::HashValues & hash_values, const TiDB::TiDBCollatorPtr &, String &) const override;
Expand Down
154 changes: 150 additions & 4 deletions dbms/src/Common/ColumnsHashing.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
#include <common/memcpy.h>
#include <common/unaligned.h>

#include <memory>

namespace DB
{
Expand Down Expand Up @@ -114,12 +113,13 @@ struct HashMethodString
ALWAYS_INLINE inline auto getKeyHolder(ssize_t row, [[maybe_unused]] Arena * pool, std::vector<String> & sort_key_containers) const
{
auto last_offset = row == 0 ? 0 : offsets[row - 1];
// Remove last zero byte.
StringRef key(chars + last_offset, offsets[row] - last_offset - 1);

if constexpr (place_string_to_arena)
{
if (likely(collator))
key = collator->sortKeyFastPath(key.data, key.size, sort_key_containers[0]);
key = collator->sortKey(key.data, key.size, sort_key_containers[0]);
return ArenaKeyHolder{key, *pool};
}
else
Expand All @@ -132,6 +132,37 @@ struct HashMethodString
friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
};

/// Hash method for a single string key column.
///
/// For each row the raw bytes of the string (without the last byte of the row,
/// which `ColumnString` uses for the terminating zero) are passed through
/// `BinCollatorSortKey<padding>` and the result is wrapped, together with the
/// arena, into an `ArenaKeyHolder`.
template <typename Value, typename Mapped, bool padding>
struct HashMethodStringBin
    : public columns_hashing_impl::HashMethodBase<HashMethodStringBin<Value, Mapped, padding>, Value, Mapped, false>
{
    using Self = HashMethodStringBin<Value, Mapped, padding>;
    using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;

    /// End offset of every row inside `chars`.
    const IColumn::Offset * offsets;
    /// Concatenated bytes of all rows.
    const UInt8 * chars;

    /// Caches raw pointers to the offsets and chars of the first key column,
    /// which is cast to `ColumnString`. Key sizes and collators are ignored.
    HashMethodStringBin(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators &)
    {
        const auto & string_column = assert_cast<const ColumnString &>(*key_columns[0]);
        chars = string_column.getChars().data();
        offsets = string_column.getOffsets().data();
    }

    /// Builds the key holder for `row`; the sort-key container argument is unused.
    ALWAYS_INLINE inline auto getKeyHolder(ssize_t row, Arena * pool, std::vector<String> &) const
    {
        // A row starts where the previous one ended; the first row starts at 0.
        const auto row_begin = row != 0 ? offsets[row - 1] : 0;

        // `- 1` drops the last byte of the row.
        const StringRef raw(chars + row_begin, offsets[row] - row_begin - 1);
        const StringRef sort_key = BinCollatorSortKey<padding>(raw.data, raw.size);
        return ArenaKeyHolder{sort_key, *pool};
    }

protected:
    friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;
};

/*
/// For the case when there is multi string key.
template <typename Value, typename Mapped>
struct HashMethodMultiString
Expand Down Expand Up @@ -172,8 +203,6 @@ struct HashMethodMultiString
{
auto num = offsets.size();
static_assert(std::is_same_v<size_t, decltype(reinterpret_cast<const StringRef *>(0)->size)>);

const char * begin = nullptr;
size_t sum_size = 0;
Expand Down Expand Up @@ -223,6 +252,123 @@ struct HashMethodMultiString
protected:
friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;
};
*/

static_assert(std::is_same_v<size_t, decltype(reinterpret_cast<const StringRef *>(0)->size)>);

struct KeyDescNumber64
{
using ColumnType = ColumnUInt64;
using AllocSize = size_t;
static constexpr size_t ElementSize = sizeof(ColumnType::value_type);

explicit KeyDescNumber64(const IColumn * key_column_)
{
column = static_cast<const ColumnType *>(key_column_);
}
static inline void serializeKey(char *& pos, const StringRef & ref)
{
std::memcpy(pos, ref.data, ElementSize);
pos += ElementSize;
}
ALWAYS_INLINE inline AllocSize getKey(ssize_t row, StringRef & ref) const
{
const auto & element = column->getElement(row);
ref = {reinterpret_cast<const char *>(&element), ElementSize};
return ElementSize;
}
const ColumnType * column{};
};

struct KeyDescStringBin
{
using ColumnType = ColumnString;
using AllocSize = size_t;

explicit KeyDescStringBin(const IColumn * key_column_)
{
column = static_cast<const ColumnType *>(key_column_);
}
static inline void serializeKey(char *& pos, const StringRef & ref)
{
std::memcpy(pos, &ref.size, sizeof(ref.size));
pos += sizeof(ref.size);
inline_memcpy(pos, ref.data, ref.size);
pos += ref.size;
}

template <typename F>
ALWAYS_INLINE inline AllocSize getKeyImpl(ssize_t row, StringRef & key, F && fn_handle_key) const
{
const auto * offsets = column->getOffsets().data();
const auto * chars = column->getChars().data();

size_t last_offset = 0;
if (likely(row != 0))
last_offset = offsets[row - 1];

key = {chars + last_offset, offsets[row] - last_offset - 1};
key = fn_handle_key(key);

return key.size + sizeof(key.size);
}

ALWAYS_INLINE inline AllocSize getKey(ssize_t row, StringRef & ref) const
{
return getKeyImpl(row, ref, [](StringRef key) {
return key;
});
}

const ColumnType * column{};
};

/// Variant of `KeyDescStringBin` whose `getKey` passes every key through
/// `DB::BinCollatorSortKey<true>` before it is measured and serialized.
/// NOTE(review): presumably this is the sort key of the padding binary
/// collation — confirm against `BinCollatorSortKey`.
struct KeyDescStringBinPadding : KeyDescStringBin
{
    explicit KeyDescStringBinPadding(const IColumn * key_column_)
        : KeyDescStringBin(key_column_)
    {}

    /// Sets `ref` to the transformed key of `row` and returns the number of
    /// bytes needed to serialize it.
    ALWAYS_INLINE inline AllocSize getKey(ssize_t row, StringRef & ref) const
    {
        const auto to_sort_key = [](StringRef key) {
            return DB::BinCollatorSortKey<true>(key.data, key.size);
        };
        return getKeyImpl(row, ref, to_sort_key);
    }
};

/// Hash method for exactly two key columns.
///
/// Each key column is described by a key descriptor (`Key1Desc`, `Key2Desc`)
/// that provides `getKey(row, ref)` — yielding a view of the key and the
/// number of bytes it serializes to — and a static `serializeKey(pos, ref)`
/// that writes the key at `pos` and advances `pos`. The two serialized keys
/// are laid out back to back in memory taken from the arena.
template <typename Key1Desc, typename Key2Desc, typename Value, typename Mapped>
struct HashMethodFastPathTwoKeysSerialized
    : public columns_hashing_impl::HashMethodBase<HashMethodFastPathTwoKeysSerialized<Key1Desc, Key2Desc, Value, Mapped>, Value, Mapped, false>
{
    using Self = HashMethodFastPathTwoKeysSerialized<Key1Desc, Key2Desc, Value, Mapped>;
    using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;

    Key1Desc key_1_desc;
    Key2Desc key_2_desc;

    /// Builds the descriptors from the first two key columns; key sizes and
    /// collators are ignored.
    HashMethodFastPathTwoKeysSerialized(const ColumnRawPtrs & key_columns, const Sizes &, const TiDB::TiDBCollators &)
        : key_1_desc(key_columns[0])
        , key_2_desc(key_columns[1])
    {
    }

    /// Serializes both keys of `row` into one buffer allocated from `pool` and
    /// returns a `SerializedKeyHolder` covering that buffer. The sort-key
    /// container argument is unused.
    ALWAYS_INLINE inline auto getKeyHolder(ssize_t row, Arena * pool, std::vector<String> &) const
    {
        StringRef first_key;
        StringRef second_key;
        const auto first_size = key_1_desc.getKey(row, first_key);
        const auto second_size = key_2_desc.getKey(row, second_key);
        const size_t total_size = first_size + second_size;

        char * const buffer = pool->alloc(total_size);

        // `cursor` is advanced by each serializeKey call; `buffer` keeps the start.
        char * cursor = buffer;
        Key1Desc::serializeKey(cursor, first_key);
        Key2Desc::serializeKey(cursor, second_key);

        return SerializedKeyHolder{{buffer, total_size}, *pool};
    }

protected:
    friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;
};


/// For the case when there is one fixed-length string key.
Expand Down
8 changes: 7 additions & 1 deletion dbms/src/Common/HashTable/StringHashTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,13 @@ class StringHashTable : private boost::noncopyable
// 3. Funcs are named callables that can be force_inlined
// NOTE: It relies on Little Endianness
template <typename Self, typename KeyHolder, typename Func>
static auto ALWAYS_INLINE dispatch(Self & self, KeyHolder && key_holder, Func && func)
static auto
#if defined(ADDRESS_SANITIZER)
NO_INLINE NO_SANITIZE_ADDRESS
#else
ALWAYS_INLINE
#endif
dispatch(Self & self, KeyHolder && key_holder, Func && func)
{
StringHashTableHash hash;
const StringRef & x = keyHolderGetKey(key_holder);
Expand Down
8 changes: 7 additions & 1 deletion dbms/src/Common/HashTable/TwoLevelStringHashTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,13 @@ class TwoLevelStringHashTable : private boost::noncopyable
// This function is mostly the same as StringHashTable::dispatch, but with
// added bucket computation. See the comments there.
template <typename Self, typename Func, typename KeyHolder>
static auto ALWAYS_INLINE dispatch(Self & self, KeyHolder && key_holder, Func && func)
static auto
#if defined(ADDRESS_SANITIZER)
NO_INLINE NO_SANITIZE_ADDRESS
#else
ALWAYS_INLINE
#endif
dispatch(Self & self, KeyHolder && key_holder, Func && func)
{
StringHashTableHash hash;
const StringRef & x = keyHolderGetKey(key_holder);
Expand Down
8 changes: 4 additions & 4 deletions dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1371,10 +1371,10 @@ void DAGExpressionAnalyzer::makeExplicitSet(
set_element_types.push_back(sample_block.getByName(left_arg_name).type);

// todo if this is a single value in, then convert it to equal expr
SetPtr set = std::make_shared<Set>(SizeLimits(settings.max_rows_in_set, settings.max_bytes_in_set, settings.set_overflow_mode));
TiDB::TiDBCollators collators;
collators.push_back(getCollatorFromExpr(expr));
set->setCollators(collators);
SetPtr set = std::make_shared<Set>(
SizeLimits(settings.max_rows_in_set, settings.max_bytes_in_set, settings.set_overflow_mode),
TiDB::TiDBCollators{getCollatorFromExpr(expr)});

auto remaining_exprs = set->createFromDAGExpr(set_element_types, expr, create_ordered_set);
prepared_sets[&expr] = std::make_shared<DAGSet>(std::move(set), std::move(remaining_exprs));
}
Expand Down
Loading

0 comments on commit 1a50e00

Please sign in to comment.