diff --git a/CMakeLists.txt b/CMakeLists.txt index e989427d99a..d7c00eb2e9b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -511,6 +511,11 @@ if (ARCH_AMD64) else() add_definitions(-DTIFLASH_COMPILER_VPCLMULQDQ_SUPPORT=0) endif() + + check_cxx_compiler_flag("-mmovbe" TIFLASH_COMPILER_MOVBE_SUPPORT) + if (TIFLASH_COMPILER_MOVBE_SUPPORT) + set(COMPILER_MOVBE_FLAG "-mmovbe") + endif() else() add_definitions(-DTIFLASH_COMPILER_VPCLMULQDQ_SUPPORT=0) endif() diff --git a/dbms/src/AggregateFunctions/AggregateFunctionMinMaxAny.h b/dbms/src/AggregateFunctions/AggregateFunctionMinMaxAny.h index 9ce7060a9eb..5cfe7ded022 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionMinMaxAny.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionMinMaxAny.h @@ -193,28 +193,28 @@ struct SingleValueDataString Int32 size = -1; /// -1 indicates that there is no value. Int32 capacity = 0; /// power of two or zero - char * large_data; - TiDB::TiDBCollatorPtr collator = nullptr; + char * large_data{}; + TiDB::TiDBCollatorPtr collator{}; bool less(const StringRef & a, const StringRef & b) const { - if (collator == nullptr) + if (unlikely(collator == nullptr)) return a < b; - return collator->compare(a.data, a.size, b.data, b.size) < 0; + return collator->compareFastPath(a.data, a.size, b.data, b.size) < 0; } bool greater(const StringRef & a, const StringRef & b) const { - if (collator == nullptr) + if (unlikely(collator == nullptr)) return a > b; - return collator->compare(a.data, a.size, b.data, b.size) > 0; + return collator->compareFastPath(a.data, a.size, b.data, b.size) > 0; } bool equalTo(const StringRef & a, const StringRef & b) const { - if (collator == nullptr) + if (unlikely(collator == nullptr)) return a == b; - return collator->compare(a.data, a.size, b.data, b.size) == 0; + return collator->compareFastPath(a.data, a.size, b.data, b.size) == 0; } public: @@ -222,7 +222,7 @@ struct SingleValueDataString static constexpr Int32 MAX_SMALL_STRING_SIZE = AUTOMATIC_STORAGE_SIZE - sizeof(size) - sizeof(capacity) - sizeof(large_data) - sizeof(collator); private: - char small_data[MAX_SMALL_STRING_SIZE]; /// Including the terminating zero. + char small_data[MAX_SMALL_STRING_SIZE]{}; /// Including the terminating zero. 
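A note on the `compareFastPath` calls above: the helper is introduced later in this patch (Collator.h, backed by the new CollatorCompare.h). For the padding-binary collations it skips the virtual `compare` and reduces to a trailing-space trim plus a byte-wise comparison; other collations still take the virtual call. A minimal sketch using the helpers this patch adds (the wrapper name and include path are illustrative only):

    // Roughly what the fast path boils down to for UTF8MB4_BIN-style (padding) collations.
    #include <Storages/Transaction/CollatorCompare.h> // DB::BinCollatorCompare, new in this patch

    inline int padding_bin_compare(const char * s1, size_t n1, const char * s2, size_t n2)
    {
        // BinCollatorCompare<true> == RtrimStrCompare: strip trailing ' ' from both
        // inputs, then compare the remaining bytes like memcmp.
        return DB::BinCollatorCompare<true>(s1, n1, s2, n2);
    }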
public: bool has() const diff --git a/dbms/src/AggregateFunctions/IAggregateFunction.h b/dbms/src/AggregateFunctions/IAggregateFunction.h index 646be9c928f..4bf308dc21f 100644 --- a/dbms/src/AggregateFunctions/IAggregateFunction.h +++ b/dbms/src/AggregateFunctions/IAggregateFunction.h @@ -439,7 +439,7 @@ struct AggregationCollatorsWrapper if (likely(collators.size() > column_index)) { if (collators[column_index] != nullptr) - return collators[column_index]->sortKey(in.data, in.size, sort_key_containers[column_index]); + return collators[column_index]->sortKeyFastPath(in.data, in.size, sort_key_containers[column_index]); return in; } else if (collators.empty()) diff --git a/dbms/src/Columns/ColumnString.cpp b/dbms/src/Columns/ColumnString.cpp index 5b184ef8983..990e5c80290 100644 --- a/dbms/src/Columns/ColumnString.cpp +++ b/dbms/src/Columns/ColumnString.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -94,7 +95,7 @@ void ColumnString::insertRangeFrom(const IColumn & src, size_t start, size_t len size_t old_chars_size = chars.size(); chars.resize(old_chars_size + nested_length); - memcpy(&chars[old_chars_size], &src_concrete.chars[nested_offset], nested_length); + inline_memcpy(&chars[old_chars_size], &src_concrete.chars[nested_offset], nested_length); if (start == 0 && offsets.empty()) { @@ -315,12 +316,10 @@ int ColumnString::compareAtWithCollationImpl(size_t n, size_t m, const IColumn & { const auto & rhs = static_cast(rhs_); - return collator.compare( - reinterpret_cast(&chars[offsetAt(n)]), - sizeAt(n) - 1, // Skip last zero byte. - reinterpret_cast(&rhs.chars[rhs.offsetAt(m)]), - rhs.sizeAt(m) - 1 // Skip last zero byte. - ); + auto a = getDataAt(n); + auto b = rhs.getDataAt(m); + + return collator.compare(a.data, a.size, b.data, b.size); } // Derived must implement function `int compare(const char *, size_t, const char *, size_t)`. diff --git a/dbms/src/Columns/ColumnString.h b/dbms/src/Columns/ColumnString.h index 2204319e090..ffea54625e9 100644 --- a/dbms/src/Columns/ColumnString.h +++ b/dbms/src/Columns/ColumnString.h @@ -211,10 +211,10 @@ class ColumnString final : public COWPtrHelper StringRef res; - if (collator != nullptr) + if (likely(collator != nullptr)) { // Skip last zero byte. - auto sort_key = collator->sortKey(reinterpret_cast(src), string_size - 1, sort_key_container); + auto sort_key = collator->sortKeyFastPath(reinterpret_cast(src), string_size - 1, sort_key_container); string_size = sort_key.size; src = sort_key.data; } @@ -244,10 +244,10 @@ class ColumnString final : public COWPtrHelper { size_t string_size = sizeAt(n); size_t offset = offsetAt(n); - if (collator != nullptr) + if (likely(collator != nullptr)) { // Skip last zero byte. - auto sort_key = collator->sortKey(reinterpret_cast(&chars[offset]), string_size - 1, sort_key_container); + auto sort_key = collator->sortKeyFastPath(reinterpret_cast(&chars[offset]), string_size - 1, sort_key_container); string_size = sort_key.size; hash.update(reinterpret_cast(&string_size), sizeof(string_size)); hash.update(sort_key.data, sort_key.size); @@ -278,16 +278,7 @@ class ColumnString final : public COWPtrHelper int compareAt(size_t n, size_t m, const IColumn & rhs_, int /*nan_direction_hint*/) const override { const auto & rhs = static_cast(rhs_); - - const size_t size = sizeAt(n); - const size_t rhs_size = rhs.sizeAt(m); - - int cmp = memcmp(&chars[offsetAt(n)], &rhs.chars[rhs.offsetAt(m)], std::min(size, rhs_size)); - - if (cmp != 0) - return cmp; - else - return size > rhs_size ? 
1 : (size < rhs_size ? -1 : 0); + return getDataAtWithTerminatingZero(n).compare(rhs.getDataAtWithTerminatingZero(m)); } int compareAt(size_t n, size_t m, const IColumn & rhs_, int, const ICollator & collator) const override diff --git a/dbms/src/Columns/ColumnsCommon.cpp b/dbms/src/Columns/ColumnsCommon.cpp index 0cfa6eeae29..da468c86505 100644 --- a/dbms/src/Columns/ColumnsCommon.cpp +++ b/dbms/src/Columns/ColumnsCommon.cpp @@ -12,13 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if __SSE2__ -#include -#endif - #include #include - +#include namespace DB { @@ -146,7 +142,7 @@ struct ResultOffsetsBuilder { const auto offsets_size_old = res_offsets.size(); res_offsets.resize(offsets_size_old + SIMD_BYTES); - memcpy(&res_offsets[offsets_size_old], src_offsets_pos, SIMD_BYTES * sizeof(IColumn::Offset)); + inline_memcpy(&res_offsets[offsets_size_old], src_offsets_pos, SIMD_BYTES * sizeof(IColumn::Offset)); if (!first) { @@ -194,7 +190,7 @@ void filterArraysImplGeneric( { const size_t size = src_offsets.size(); if (size != filt.size()) - throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + throw Exception(fmt::format("size of filter {} doesn't match size of column {}", filt.size(), size), ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); ResultOffsetsBuilder result_offsets_builder(res_offsets); @@ -223,7 +219,7 @@ void filterArraysImplGeneric( const auto elems_size_old = res_elems.size(); res_elems.resize(elems_size_old + size); - memcpy(&res_elems[elems_size_old], &src_elems[offset], size * sizeof(T)); + inline_memcpy(&res_elems[elems_size_old], &src_elems[offset], size * sizeof(T)); }; #if __SSE2__ @@ -233,7 +229,7 @@ void filterArraysImplGeneric( while (filt_pos < filt_end_aligned) { - const auto mask = _mm_movemask_epi8(_mm_cmpgt_epi8( + uint32_t mask = _mm_movemask_epi8(_mm_cmpgt_epi8( _mm_loadu_si128(reinterpret_cast(filt_pos)), zero_vec)); @@ -254,13 +250,16 @@ void filterArraysImplGeneric( /// copy elements for SIMD_BYTES arrays at once const auto elems_size_old = res_elems.size(); res_elems.resize(elems_size_old + chunk_size); - memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset], chunk_size * sizeof(T)); + inline_memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset], chunk_size * sizeof(T)); } else { - for (size_t i = 0; i < SIMD_BYTES; ++i) - if (filt_pos[i]) - copy_array(offsets_pos + i); + while (mask) + { + size_t index = __builtin_ctz(mask); + copy_array(offsets_pos + index); + mask = mask & (mask - 1); + } } filt_pos += SIMD_BYTES; diff --git a/dbms/src/Common/ColumnsHashing.h b/dbms/src/Common/ColumnsHashing.h index 525a7f5ab4d..dbf50175007 100644 --- a/dbms/src/Common/ColumnsHashing.h +++ b/dbms/src/Common/ColumnsHashing.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -72,7 +73,7 @@ struct HashMethodOneNumber using Base::getHash; /// (const Data & data, size_t row, Arena & pool) -> size_t /// Is used for default implementation in HashMethodBase. 
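A side note on the `__builtin_ctz` loop added to ColumnsCommon.cpp above: instead of testing every one of the SIMD_BYTES filter bytes, it visits only the set bits of the `_mm_movemask_epi8` result and clears the lowest bit after each hit. The idiom in isolation, as a standalone sketch (the function name is illustrative only):

    #include <cstdint>

    // Invoke `visit` with the index of every set bit in `mask`, lowest first.
    template <typename F>
    inline void for_each_set_bit(uint32_t mask, F && visit)
    {
        while (mask)
        {
            visit(__builtin_ctz(mask)); // index of the lowest set bit
            mask &= mask - 1;           // clear that bit and continue
        }
    }

For sparse filters this costs one iteration per selected row instead of one per byte of the 16-byte block.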
- FieldType getKeyHolder(size_t row, Arena *, std::vector &) const + ALWAYS_INLINE inline FieldType getKeyHolder(size_t row, Arena *, std::vector &) const { if constexpr (std::is_same_v) return vec[row]; @@ -99,7 +100,7 @@ struct HashMethodString HashMethodString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators & collators) { const IColumn & column = *key_columns[0]; - const ColumnString & column_string = assert_cast(column); + const auto & column_string = assert_cast(column); offsets = column_string.getOffsets().data(); chars = column_string.getChars().data(); if (!collators.empty()) @@ -110,15 +111,15 @@ struct HashMethodString } } - auto getKeyHolder(ssize_t row, [[maybe_unused]] Arena * pool, std::vector & sort_key_containers) const + ALWAYS_INLINE inline auto getKeyHolder(ssize_t row, [[maybe_unused]] Arena * pool, std::vector & sort_key_containers) const { auto last_offset = row == 0 ? 0 : offsets[row - 1]; StringRef key(chars + last_offset, offsets[row] - last_offset - 1); if constexpr (place_string_to_arena) { - if (collator) - key = collator->sortKey(key.data, key.size, sort_key_containers[0]); + if (likely(collator)) + key = collator->sortKeyFastPath(key.data, key.size, sort_key_containers[0]); return ArenaKeyHolder{key, *pool}; } else @@ -127,6 +128,98 @@ struct HashMethodString } } +protected: + friend class columns_hashing_impl::HashMethodBase; +}; + +/// For the case when there is multi string key. +template +struct HashMethodMultiString + : public columns_hashing_impl::HashMethodBase, Value, Mapped, false> +{ + using Self = HashMethodMultiString; + using Base = columns_hashing_impl::HashMethodBase; + + std::vector offsets; + std::vector chars; + TiDB::TiDBCollators collators; + bool all_collators_padding_bin = false; + + HashMethodMultiString(const ColumnRawPtrs & key_columns, const Sizes &, const TiDB::TiDBCollators & collators_) + : collators(collators_) + { + size_t num = key_columns.size(); + offsets.resize(num); + chars.resize(num); + + for (size_t i = 0; i < num; ++i) + { + const IColumn & column = *key_columns[i]; + const auto & column_string = assert_cast(column); + offsets[i] = column_string.getOffsets().data(); + chars[i] = column_string.getChars().data(); + } + if (!collators.empty()) + { + all_collators_padding_bin = std::all_of(collators.begin(), collators.end(), [](auto & x) { + return x->isPaddingBinary(); + }); + } + } + + template + ALWAYS_INLINE inline SerializedKeyHolder genSerializedKeyHolder(ssize_t row, Arena * pool, F && fn_handle_key) const + { + auto num = offsets.size(); + + static_assert(std::is_same_v(0)->size)>); + + const char * begin = nullptr; + size_t sum_size = 0; + + for (size_t key_index = 0; key_index < num; ++key_index) + { + auto last_offset = row == 0 ? 
0 : offsets[key_index][row - 1]; + StringRef key(chars[key_index] + last_offset, offsets[key_index][row] - last_offset - 1); + + key = fn_handle_key(key_index, key); + + char * pos = pool->allocContinue(key.size + sizeof(key.size), begin); + { + memcpy(pos, &key.size, sizeof(key.size)); + inline_memcpy(pos + sizeof(key.size), key.data, key.size); + } + + sum_size += key.size + sizeof(key.size); + } + return SerializedKeyHolder{{begin, sum_size}, *pool}; + } + + ALWAYS_INLINE inline auto getKeyHolder(ssize_t row, Arena * pool, std::vector & sort_key_containers) const + { + if (likely(all_collators_padding_bin)) + { + return genSerializedKeyHolder(row, pool, [](size_t, StringRef key) { + return DB::BinCollatorSortKey(key.data, key.size); + }); + } + + if (unlikely(collators.empty())) + { + return genSerializedKeyHolder(row, pool, [](size_t, StringRef key) { + return key; + }); + } + else + { + return genSerializedKeyHolder(row, pool, [&](size_t key_index, StringRef key) { + if (collators[key_index]) + return collators[key_index]->sortKey(key.data, key.size, sort_key_containers[key_index]); + return key; + }); + } + } + protected: friend class columns_hashing_impl::HashMethodBase; }; @@ -147,20 +240,20 @@ struct HashMethodFixedString HashMethodFixedString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators & collators) { const IColumn & column = *key_columns[0]; - const ColumnFixedString & column_string = assert_cast(column); + const auto & column_string = assert_cast(column); n = column_string.getN(); chars = &column_string.getChars(); if (!collators.empty()) collator = collators[0]; } - auto getKeyHolder(size_t row, [[maybe_unused]] Arena * pool, std::vector & sort_key_containers) const + ALWAYS_INLINE inline auto getKeyHolder(size_t row, [[maybe_unused]] Arena * pool, std::vector & sort_key_containers) const { StringRef key(&(*chars)[row * n], n); if (collator) { - key = collator->sortKey(key.data, key.size, sort_key_containers[0]); + key = collator->sortKeyFastPath(key.data, key.size, sort_key_containers[0]); } if constexpr (place_string_to_arena) @@ -280,7 +373,7 @@ struct HashMethodKeysFixed #endif } - ALWAYS_INLINE Key getKeyHolder(size_t row, Arena *, std::vector &) const + ALWAYS_INLINE inline Key getKeyHolder(size_t row, Arena *, std::vector &) const { if constexpr (has_nullable_keys) { @@ -357,7 +450,7 @@ struct HashMethodSerialized , collators(collators_) {} - ALWAYS_INLINE SerializedKeyHolder getKeyHolder(size_t row, Arena * pool, std::vector & sort_key_containers) const + ALWAYS_INLINE inline SerializedKeyHolder getKeyHolder(size_t row, Arena * pool, std::vector & sort_key_containers) const { return SerializedKeyHolder{ serializeKeysToPoolContiguous(row, keys_size, key_columns, collators, sort_key_containers, *pool), @@ -385,7 +478,7 @@ struct HashMethodHashed , collators(collators_) {} - ALWAYS_INLINE Key getKeyHolder(size_t row, Arena *, std::vector & sort_key_containers) const + ALWAYS_INLINE inline Key getKeyHolder(size_t row, Arena *, std::vector & sort_key_containers) const { return hash128(row, key_columns.size(), key_columns, collators, sort_key_containers); } diff --git a/dbms/src/Common/ColumnsHashingImpl.h b/dbms/src/Common/ColumnsHashingImpl.h index 8d8da4318e8..fff3d7a9801 100644 --- a/dbms/src/Common/ColumnsHashingImpl.h +++ b/dbms/src/Common/ColumnsHashingImpl.h @@ -128,21 +128,21 @@ class HashMethodBase using Cache = LastElementCache; template - ALWAYS_INLINE EmplaceResult emplaceKey(Data & data, size_t row, Arena & pool, 
std::vector & sort_key_containers) + ALWAYS_INLINE inline EmplaceResult emplaceKey(Data & data, size_t row, Arena & pool, std::vector & sort_key_containers) { auto key_holder = static_cast(*this).getKeyHolder(row, &pool, sort_key_containers); return emplaceImpl(key_holder, data); } template - ALWAYS_INLINE FindResult findKey(Data & data, size_t row, Arena & pool, std::vector & sort_key_containers) + ALWAYS_INLINE inline FindResult findKey(Data & data, size_t row, Arena & pool, std::vector & sort_key_containers) { auto key_holder = static_cast(*this).getKeyHolder(row, &pool, sort_key_containers); return findKeyImpl(keyHolderGetKey(key_holder), data); } template - ALWAYS_INLINE size_t getHash(const Data & data, size_t row, Arena & pool, std::vector & sort_key_containers) + ALWAYS_INLINE inline size_t getHash(const Data & data, size_t row, Arena & pool, std::vector & sort_key_containers) { auto key_holder = static_cast(*this).getKeyHolder(row, &pool, sort_key_containers); return data.hash(keyHolderGetKey(key_holder)); @@ -167,7 +167,7 @@ class HashMethodBase } template - ALWAYS_INLINE EmplaceResult emplaceImpl(KeyHolder & key_holder, Data & data) + ALWAYS_INLINE inline EmplaceResult emplaceImpl(KeyHolder & key_holder, Data & data) { if constexpr (Cache::consecutive_keys_optimization) { @@ -220,7 +220,7 @@ class HashMethodBase } template - ALWAYS_INLINE FindResult findKeyImpl(Key key, Data & data) + ALWAYS_INLINE inline FindResult findKeyImpl(Key key, Data & data) { if constexpr (Cache::consecutive_keys_optimization) { diff --git a/dbms/src/Functions/CMakeLists.txt b/dbms/src/Functions/CMakeLists.txt index eb45bd772d6..39d3e687721 100644 --- a/dbms/src/Functions/CMakeLists.txt +++ b/dbms/src/Functions/CMakeLists.txt @@ -20,7 +20,7 @@ add_headers_and_sources(clickhouse_functions ./Conditional) add_headers_and_sources(clickhouse_functions ${TiFlash_BINARY_DIR}/dbms/src/Functions) if (TIFLASH_ENABLE_AVX_SUPPORT) - set_source_files_properties(CollationStringOptimized.cpp APPEND COMPILE_FLAGS "-mavx -mavx2") + set_source_files_properties(CollationStringOptimized.cpp APPEND COMPILE_FLAGS "-mavx -mavx2 ${COMPILER_MOVBE_FLAG}") endif () list(REMOVE_ITEM clickhouse_functions_sources IFunction.cpp FunctionFactory.cpp FunctionHelpers.cpp) diff --git a/dbms/src/Functions/FunctionsComparison.h b/dbms/src/Functions/FunctionsComparison.h index 84364e74369..fda8a113ada 100644 --- a/dbms/src/Functions/FunctionsComparison.h +++ b/dbms/src/Functions/FunctionsComparison.h @@ -364,7 +364,7 @@ struct StringComparisonWithCollatorImpl size_t a_n = a.size(); size_t b_n = b.size(); - int res = collator->compare(reinterpret_cast(a.data()), a_n, reinterpret_cast(b.data()), b_n); + int res = collator->compareFastPath(reinterpret_cast(a.data()), a_n, reinterpret_cast(b.data()), b_n); c = Op::apply(res, 0); } }; diff --git a/dbms/src/Functions/LeastGreatest.h b/dbms/src/Functions/LeastGreatest.h index 9ca126a2ada..55e76b66524 100644 --- a/dbms/src/Functions/LeastGreatest.h +++ b/dbms/src/Functions/LeastGreatest.h @@ -13,7 +13,7 @@ // limitations under the License. 
#pragma once -#include + #include #include #include diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 6cb947a1bfa..f5311a6ab0f 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -303,6 +304,18 @@ AggregatedDataVariants::Type Aggregator::chooseAggregationMethod() if (params.keys_size == 1 && types_removed_nullable[0]->isString()) return AggregatedDataVariants::Type::key_string; + if (params.keys_size > 1 && types_removed_nullable[0]->isString()) + { + bool is_all_str = std::all_of(types_removed_nullable.data(), types_removed_nullable.data() + params.keys_size, [](const auto & x) { + return x->isString(); + }); + + if (is_all_str) + { + return AggregatedDataVariants::Type::multi_key_string; + } + } + if (params.keys_size == 1 && types_removed_nullable[0]->isFixedString()) return AggregatedDataVariants::Type::key_fixed_string; @@ -361,7 +374,7 @@ void NO_INLINE Aggregator::executeImpl( } template -void NO_INLINE Aggregator::executeImplBatch( +ALWAYS_INLINE void Aggregator::executeImplBatch( Method & method, typename Method::State & state, Arena * aggregates_pool, diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index 052e3dbdcbb..688298cb14d 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -188,7 +188,7 @@ struct AggregationMethodStringNoCache AggregationMethodStringNoCache() = default; template - AggregationMethodStringNoCache(const Other & other) + explicit AggregationMethodStringNoCache(const Other & other) : data(other.data) {} @@ -202,6 +202,35 @@ struct AggregationMethodStringNoCache } }; +/// Same as above but without cache +template +struct AggregationMethodMultiStringNoCache +{ + using Data = TData; + using Key = typename Data::key_type; + using Mapped = typename Data::mapped_type; + + Data data; + + AggregationMethodMultiStringNoCache() = default; + + template + explicit AggregationMethodMultiStringNoCache(const Other & other) + : data(other.data) + {} + + using State = ColumnsHashing::HashMethodMultiString; + + std::optional shuffleKeyColumns(std::vector &, const Sizes &) { return {}; } + + static void insertKeyIntoColumns(const StringRef & key, std::vector & key_columns, const Sizes &, const TiDB::TiDBCollators &) + { + const auto * pos = key.data; + for (auto & key_column : key_columns) + pos = static_cast(key_column)->deserializeAndInsertFromArena(pos, nullptr); + } +}; + /// For the case where there is one fixed-length string key. 
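For the new `multi_key_string` case chosen above, each row's grouping key is materialized in the arena by `HashMethodMultiString::genSerializedKeyHolder` (earlier in this patch) as a length-prefixed concatenation of the per-column keys, after collation handling. A standalone sketch of that encoding, with illustrative names only:

    #include <cstring>
    #include <initializer_list>
    #include <string>
    #include <string_view>

    // Per row: [size_t len0][bytes0][size_t len1][bytes1]... for every string key column.
    std::string serialize_multi_string_key(std::initializer_list<std::string_view> keys)
    {
        std::string out;
        for (const auto key : keys)
        {
            const size_t len = key.size();
            out.append(reinterpret_cast<const char *>(&len), sizeof(len)); // length prefix
            out.append(key.data(), key.size());                            // raw bytes / sort key
        }
        return out;
    }

`insertKeyIntoColumns` above walks the same buffer to rebuild the individual key columns.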
template struct AggregationMethodFixedString @@ -418,6 +447,7 @@ struct AggregatedDataVariants : private boost::noncopyable std::unique_ptr> key64; std::unique_ptr> key_int256; std::unique_ptr> key_string; + std::unique_ptr> multi_key_string; std::unique_ptr> key_fixed_string; std::unique_ptr> keys16; std::unique_ptr> keys32; @@ -430,6 +460,7 @@ struct AggregatedDataVariants : private boost::noncopyable std::unique_ptr> key64_two_level; std::unique_ptr> key_int256_two_level; std::unique_ptr> key_string_two_level; + std::unique_ptr> multi_key_string_two_level; std::unique_ptr> key_fixed_string_two_level; std::unique_ptr> keys32_two_level; std::unique_ptr> keys64_two_level; @@ -457,6 +488,7 @@ struct AggregatedDataVariants : private boost::noncopyable M(key32, false) \ M(key64, false) \ M(key_string, false) \ + M(multi_key_string, false) \ M(key_fixed_string, false) \ M(keys16, false) \ M(keys32, false) \ @@ -469,6 +501,7 @@ struct AggregatedDataVariants : private boost::noncopyable M(key64_two_level, true) \ M(key_int256_two_level, true) \ M(key_string_two_level, true) \ + M(multi_key_string_two_level, true) \ M(key_fixed_string_two_level, true) \ M(keys32_two_level, true) \ M(keys64_two_level, true) \ @@ -628,6 +661,7 @@ struct AggregatedDataVariants : private boost::noncopyable M(keys128) \ M(keys256) \ M(serialized) \ + M(multi_key_string) \ M(nullable_keys128) \ M(nullable_keys256) @@ -675,6 +709,7 @@ struct AggregatedDataVariants : private boost::noncopyable M(keys128_two_level) \ M(keys256_two_level) \ M(serialized_two_level) \ + M(multi_key_string_two_level) \ M(nullable_keys128_two_level) \ M(nullable_keys256_two_level) }; diff --git a/dbms/src/Storages/Transaction/Collator.h b/dbms/src/Storages/Transaction/Collator.h index 3e7410023bb..7236e21dd62 100644 --- a/dbms/src/Storages/Transaction/Collator.h +++ b/dbms/src/Storages/Transaction/Collator.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include @@ -83,6 +84,16 @@ class ITiDBCollator : public ICollator ~ITiDBCollator() override = default; int compare(const char * s1, size_t length1, const char * s2, size_t length2) const override = 0; + + ALWAYS_INLINE inline int compareFastPath(const char * s1, size_t length1, const char * s2, size_t length2) const + { + if (likely(isPaddingBinary())) + { + return DB::BinCollatorCompare(s1, length1, s2, length2); + } + return compare(s1, length1, s2, length2); + } + virtual StringRef sortKey(const char * s, size_t length, std::string & container) const = 0; virtual std::unique_ptr pattern() const = 0; int32_t getCollatorId() const { return collator_id; } @@ -90,6 +101,38 @@ class ITiDBCollator : public ICollator bool isBinary() const; bool isCI() const; + ALWAYS_INLINE static inline bool isPaddingBinary(CollatorType collator_type) + { + switch (collator_type) + { + case CollatorType::UTF8MB4_BIN: + case CollatorType::UTF8_BIN: + case CollatorType::LATIN1_BIN: + case CollatorType::ASCII_BIN: + { + // collator_type < 4 + return true; + } + default: + break; + } + return false; + } + + ALWAYS_INLINE inline bool isPaddingBinary() const + { + return isPaddingBinary(getCollatorType()); + } + + ALWAYS_INLINE inline StringRef sortKeyFastPath(const char * s, size_t length, std::string & container) const + { + if (likely(isPaddingBinary())) + { + return DB::BinCollatorSortKey(s, length); + } + return sortKey(s, length, container); + } + protected: explicit ITiDBCollator(int32_t collator_id_); int32_t collator_id; // collator id to be compatible with TiDB diff --git 
a/dbms/src/Storages/Transaction/CollatorCompare.h b/dbms/src/Storages/Transaction/CollatorCompare.h new file mode 100644 index 00000000000..2124d9ac839 --- /dev/null +++ b/dbms/src/Storages/Transaction/CollatorCompare.h @@ -0,0 +1,98 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include +#include + +namespace DB +{ + +template +ALWAYS_INLINE inline int signum(T val) +{ + return (0 < val) - (val < 0); +} + +// Check equality is much faster than other comparison. +// - check size first +// - return 0 if equal else 1 +FLATTEN_INLINE_PURE static inline int RawStrEqualCompare(const std::string_view & lhs, const std::string_view & rhs) +{ + return mem_utils::IsStrViewEqual(lhs, rhs) ? 0 : 1; +} + +// Compare str view by memcmp +FLATTEN_INLINE_PURE inline int RawStrCompare(const std::string_view & v1, const std::string_view & v2) +{ + return mem_utils::CompareStrView(v1, v2); +} + +constexpr char SPACE = ' '; + +FLATTEN_INLINE_PURE inline std::string_view RightTrimRaw(const std::string_view & v) +{ + size_t end = v.find_last_not_of(SPACE); + return end == std::string_view::npos ? std::string_view{} : std::string_view(v.data(), end + 1); +} + +// Remove tail space +FLATTEN_INLINE_PURE inline std::string_view RightTrim(const std::string_view & v) +{ + if (likely(v.empty() || v.back() != SPACE)) + return v; + return RightTrimRaw(v); +} + +FLATTEN_INLINE_PURE inline std::string_view RightTrimNoEmpty(const std::string_view & v) +{ + if (likely(v.back() != SPACE)) + return v; + return RightTrimRaw(v); +} + +FLATTEN_INLINE_PURE inline int RtrimStrCompare(const std::string_view & va, const std::string_view & vb) +{ + return RawStrCompare(RightTrim(va), RightTrim(vb)); +} + +template +FLATTEN_INLINE_PURE inline int BinCollatorCompare(const char * s1, size_t length1, const char * s2, size_t length2) +{ + if constexpr (padding) + return DB::RtrimStrCompare({s1, length1}, {s2, length2}); + else + return DB::RawStrCompare({s1, length1}, {s2, length2}); +} + +template +FLATTEN_INLINE_PURE inline StringRef BinCollatorSortKey(const char * s, size_t length) +{ + if constexpr (padding) + { + return StringRef(RightTrim({s, length})); + } + else + { + return StringRef(s, length); + } +} + + +} // namespace DB diff --git a/dbms/src/Storages/Transaction/CollatorUtils.h b/dbms/src/Storages/Transaction/CollatorUtils.h index 3f53791276c..328feac048f 100644 --- a/dbms/src/Storages/Transaction/CollatorUtils.h +++ b/dbms/src/Storages/Transaction/CollatorUtils.h @@ -14,86 +14,11 @@ #pragma once -#include -#include -#include - -#include -#include +#include namespace DB { -template -ALWAYS_INLINE inline int signum(T val) -{ - return (0 < val) - (val < 0); -} - -// Check equality is much faster than other comparison. 
-// - check size first -// - return 0 if equal else 1 -FLATTEN_INLINE_PURE static inline int RawStrEqualCompare(const std::string_view & lhs, const std::string_view & rhs) -{ - return mem_utils::IsStrViewEqual(lhs, rhs) ? 0 : 1; -} - -// Compare str view by memcmp -FLATTEN_INLINE_PURE inline int RawStrCompare(const std::string_view & v1, const std::string_view & v2) -{ - return v1.compare(v2); -} - -constexpr char SPACE = ' '; - -FLATTEN_INLINE_PURE inline std::string_view RightTrimRaw(const std::string_view & v) -{ - size_t end = v.find_last_not_of(SPACE); - return end == std::string_view::npos ? std::string_view{} : std::string_view(v.data(), end + 1); -} - -// Remove tail space -FLATTEN_INLINE_PURE inline std::string_view RightTrim(const std::string_view & v) -{ - if (likely(v.empty() || v.back() != SPACE)) - return v; - return RightTrimRaw(v); -} - -FLATTEN_INLINE_PURE inline std::string_view RightTrimNoEmpty(const std::string_view & v) -{ - if (likely(v.back() != SPACE)) - return v; - return RightTrimRaw(v); -} - -FLATTEN_INLINE_PURE inline int RtrimStrCompare(const std::string_view & va, const std::string_view & vb) -{ - return RawStrCompare(RightTrim(va), RightTrim(vb)); -} - -template -FLATTEN_INLINE_PURE inline int BinCollatorCompare(const char * s1, size_t length1, const char * s2, size_t length2) -{ - if constexpr (padding) - return DB::RtrimStrCompare({s1, length1}, {s2, length2}); - else - return DB::RawStrCompare({s1, length1}, {s2, length2}); -} - -template -FLATTEN_INLINE_PURE inline StringRef BinCollatorSortKey(const char * s, size_t length) -{ - if constexpr (padding) - { - return StringRef(RightTrim({s, length})); - } - else - { - return StringRef(s, length); - } -} - // Loop columns and invoke callback for each pair. // Remove last zero byte. 
template diff --git a/dbms/src/Storages/Transaction/TiDB.cpp b/dbms/src/Storages/Transaction/TiDB.cpp index ed107e99aba..5beda969e01 100644 --- a/dbms/src/Storages/Transaction/TiDB.cpp +++ b/dbms/src/Storages/Transaction/TiDB.cpp @@ -246,7 +246,7 @@ Int64 ColumnInfo::getEnumIndex(const String & enum_id_or_text) const collator = ITiDBCollator::getCollator("binary"); for (const auto & elem : elems) { - if (collator->compare(elem.first.data(), elem.first.size(), enum_id_or_text.data(), enum_id_or_text.size()) == 0) + if (collator->compareFastPath(elem.first.data(), elem.first.size(), enum_id_or_text.data(), enum_id_or_text.size()) == 0) { return elem.second; } @@ -265,12 +265,12 @@ UInt64 ColumnInfo::getSetValue(const String & set_str) const Poco::StringTokenizer string_tokens(set_str, ","); std::set marked; for (const auto & s : string_tokens) - marked.insert(collator->sortKey(s.data(), s.length(), sort_key_container).toString()); + marked.insert(collator->sortKeyFastPath(s.data(), s.length(), sort_key_container).toString()); UInt64 value = 0; for (size_t i = 0; i < elems.size(); i++) { - String key = collator->sortKey(elems.at(i).first.data(), elems.at(i).first.length(), sort_key_container).toString(); + String key = collator->sortKeyFastPath(elems.at(i).first.data(), elems.at(i).first.length(), sort_key_container).toString(); auto it = marked.find(key); if (it != marked.end()) { diff --git a/libs/CMakeLists.txt b/libs/CMakeLists.txt index 8138279e275..13ca92f3a52 100644 --- a/libs/CMakeLists.txt +++ b/libs/CMakeLists.txt @@ -21,9 +21,7 @@ add_subdirectory (libcommon) add_subdirectory (libpocoext) add_subdirectory (libdaemon) -if (USE_INTERNAL_MEMCPY) - add_subdirectory (libmemcpy) -endif() +add_subdirectory (libmemcpy) if (GLIBC_COMPATIBILITY) add_subdirectory (libglibc-compatibility) diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index 9cb8ec8c146..e5283c086a9 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -160,7 +160,7 @@ endif () if (TIFLASH_ENABLE_AVX_SUPPORT) # https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html set_source_files_properties(src/mem_utils_avx2.cpp APPEND COMPILE_FLAGS "-mavx -mavx2") - set_source_files_properties(src/avx2_mem_utils_impl.cpp APPEND COMPILE_FLAGS "-mavx -mavx2") + set_source_files_properties(src/avx2_mem_utils_impl.cpp APPEND COMPILE_FLAGS "-mavx -mavx2 ${COMPILER_MOVBE_FLAG} -fomit-frame-pointer") if (TIFLASH_COMPILER_VPCLMULQDQ_SUPPORT) set_source_files_properties(src/crc64_avx2.cpp APPEND COMPILE_FLAGS "-mavx2 -mpclmul -mvpclmulqdq -Wno-ignored-attributes") diff --git a/libs/libcommon/include/common/StringRef.h b/libs/libcommon/include/common/StringRef.h index f43c18370d3..d03bd643786 100644 --- a/libs/libcommon/include/common/StringRef.h +++ b/libs/libcommon/include/common/StringRef.h @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -72,6 +73,11 @@ struct StringRef explicit operator std::string() const { return toString(); } constexpr explicit operator std::string_view() const { return {data, size}; } + + ALWAYS_INLINE inline int compare(const StringRef & tar) const + { + return mem_utils::CompareStrView({*this}, {tar}); + } }; /// Here constexpr doesn't implicate inline, see https://www.viva64.com/en/w/v1043/ @@ -84,16 +90,10 @@ using StringRefs = std::vector; // According to https://github.com/pingcap/tiflash/pull/5658 // - if size of memory area is bigger than 1M, instructions about avx512 may begin to get better results -// - otherwise, use `std::string_view == 
std::string_view` or `mem_utils::avx2_mem_equal`(under x86-64 with avx2) +// - otherwise, use `mem_utils::avx2_mem_equal`(under x86-64 with avx2) inline bool operator==(StringRef lhs, StringRef rhs) { - if (lhs.size != rhs.size) - return false; - - if (lhs.size == 0) - return true; - - return mem_utils::memoryEqual(lhs.data, rhs.data, lhs.size); + return mem_utils::IsStrViewEqual({lhs}, {rhs}); } inline bool operator!=(StringRef lhs, StringRef rhs) @@ -103,14 +103,12 @@ inline bool operator!=(StringRef lhs, StringRef rhs) inline bool operator<(StringRef lhs, StringRef rhs) { - int cmp = memcmp(lhs.data, rhs.data, std::min(lhs.size, rhs.size)); - return cmp < 0 || (cmp == 0 && lhs.size < rhs.size); + return lhs.compare(rhs) < 0; } inline bool operator>(StringRef lhs, StringRef rhs) { - int cmp = memcmp(lhs.data, rhs.data, std::min(lhs.size, rhs.size)); - return cmp > 0 || (cmp == 0 && lhs.size > rhs.size); + return lhs.compare(rhs) > 0; } diff --git a/libs/libcommon/include/common/avx2_mem_utils.h b/libs/libcommon/include/common/avx2_mem_utils.h index 822fb4b7a40..03e078f2f0a 100644 --- a/libs/libcommon/include/common/avx2_mem_utils.h +++ b/libs/libcommon/include/common/avx2_mem_utils.h @@ -21,6 +21,7 @@ #include #include #include +#include #include namespace mem_utils::details @@ -40,7 +41,7 @@ ALWAYS_INLINE static inline T clear_rightmost_bit_one(const T value) ALWAYS_INLINE static inline uint32_t rightmost_bit_one_index(const uint32_t value) { assert(value != 0); - return _tzcnt_u32(value); + return __builtin_ctz(value); } using Block32 = __m256i; @@ -91,20 +92,6 @@ FLATTEN_INLINE_PURE static inline int cmp_block1(const void * p1, const void * p return int32_t(read(p1)) - int32_t(read(p2)); } -FLATTEN_INLINE_PURE static inline int cmp_block8(const void * p1, const void * p2) -{ - // the left most bit may be 1, use std::memcmp(,,8) to use `sbb` - /* - bswap rcx - bswap rdx - xor eax, eax - cmp rcx, rdx - seta al - sbb eax, 0 - */ - return std::memcmp(p1, p2, 8); -} - FLATTEN_INLINE_PURE static inline int cmp_block16(const char * p1, const char * p2) { uint32_t mask = get_block16_cmp_eq_mask(p1, p2); // mask is up to 0xffff @@ -112,18 +99,37 @@ FLATTEN_INLINE_PURE static inline int cmp_block16(const char * p1, const char * if (unlikely(mask != 0)) { auto pos = rightmost_bit_one_index(mask); - return cmp_block1(p1 + pos, p2 + pos); + int ret = cmp_block1(p1 + pos, p2 + pos); + if (ret == 0) + { + __builtin_unreachable(); + } + else + { + return ret; + } } return 0; } -FLATTEN_INLINE_PURE static inline int cmp_block32(const char * p1, const char * p2) + +template +FLATTEN_INLINE_PURE static inline int cmp_block32(const char * p1, + const char * p2) { uint32_t mask = get_block32_cmp_eq_mask(p1, p2); // mask is up to 0xffffffff mask -= Block32Mask; - if (unlikely(mask != 0)) + if (must_not_eq || unlikely(mask != 0)) { auto pos = rightmost_bit_one_index(mask); - return cmp_block1(p1 + pos, p2 + pos); + int ret = cmp_block1(p1 + pos, p2 + pos); + if (ret == 0) + { + __builtin_unreachable(); + } + else + { + return ret; + } } return 0; } @@ -150,42 +156,66 @@ FLATTEN_INLINE_PURE static inline bool check_block32x4_eq(const char * a, const FLATTEN_INLINE_PURE static inline int cmp_block32x4(const char * a, const char * b) { - if (check_block32x4_eq(a, b)) + if (likely(check_block32x4_eq(a, b))) return 0; - for (size_t i = 0; i < AVX2_UNROLL_NUM - 1; ++i) + for (size_t i = 0; i < (AVX2_UNROLL_NUM - 1); ++i) { - if (auto ret = cmp_block32(a + i * BLOCK32_SIZE, (b + i * BLOCK32_SIZE)); ret) + if (auto 
ret = cmp_block32(a + i * BLOCK32_SIZE, (b + i * BLOCK32_SIZE)); unlikely(ret)) return ret; } - return cmp_block32(a + (AVX2_UNROLL_NUM - 1) * BLOCK32_SIZE, (b + (AVX2_UNROLL_NUM - 1) * BLOCK32_SIZE)); -} -FLATTEN_INLINE_PURE static inline uint32_t swap_u32(uint32_t val) -{ - return __builtin_bswap32(val); -} -FLATTEN_INLINE_PURE static inline uint64_t swap_u64(uint64_t val) -{ - return __builtin_bswap64(val); -} - -[[maybe_unused]] FLATTEN_INLINE_PURE static inline uint32_t read_u32_swap(const void * data) -{ - return swap_u32(read(data)); -} - -[[maybe_unused]] FLATTEN_INLINE_PURE static inline uint64_t read_u64_swap(const void * data) -{ - return swap_u64(read(data)); + return cmp_block32(a + (AVX2_UNROLL_NUM - 1) * BLOCK32_SIZE, (b + (AVX2_UNROLL_NUM - 1) * BLOCK32_SIZE)); } // ref: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S -FLATTEN_INLINE_PURE static inline int avx2_mem_cmp(const char * p1, const char * p2, size_t n) +FLATTEN_INLINE_PURE static inline int avx2_mem_cmp(const char * p1, const char * p2, size_t n) noexcept { constexpr size_t loop_block32x4_size = AVX2_UNROLL_NUM * BLOCK32_SIZE; // n <= 32 if (likely(n <= BLOCK32_SIZE)) { +#if !defined(AVX2_MEM_CMP_NORMAL_IF_ELSE) + +#ifdef M + static_assert(false, "`M` is defined"); +#else +#define M(x) \ + case (x): \ + { \ + return __builtin_memcmp(p1, p2, (x)); \ + } +#endif + switch (n) + { + M(0); + M(1); + M(2); + M(3); + M(4); + M(5); + M(6); + M(7); + M(8); + M(9); + M(10); + M(11); + M(12); + M(13); + M(14); + M(15); + M(16); + default: + { + // 17~32 + if (auto ret = cmp_block16(p1, p2); ret) + return ret; + return cmp_block16(p1 + n - BLOCK16_SIZE, p2 + n - BLOCK16_SIZE); + } + } +#undef M + +#else + // an optional way to check small str if (unlikely(n < 2)) { // 0~1 @@ -245,6 +275,7 @@ FLATTEN_INLINE_PURE static inline int avx2_mem_cmp(const char * p1, const char * return ret; return cmp_block16(p1 + n - BLOCK16_SIZE, p2 + n - BLOCK16_SIZE); } +#endif } // 8 * 32 < n if (unlikely(8 * BLOCK32_SIZE < n)) @@ -255,10 +286,10 @@ FLATTEN_INLINE_PURE static inline int avx2_mem_cmp(const char * p1, const char * return ret; { // align addr of one data pointer - auto offset = BLOCK32_SIZE - OFFSET_FROM_ALIGNED(size_t(p2), BLOCK32_SIZE); - p1 += offset; - p2 += offset; - n -= offset; + auto offset = ssize_t(OFFSET_FROM_ALIGNED(size_t(p2), BLOCK32_SIZE)) - BLOCK32_SIZE; + p1 -= offset; + p2 -= offset; + n += offset; } for (; n >= loop_block32x4_size;) @@ -312,13 +343,15 @@ FLATTEN_INLINE_PURE static inline int avx2_mem_cmp(const char * p1, const char * } } -FLATTEN_INLINE_PURE static inline bool avx2_mem_equal(const char * p1, const char * p2, size_t n) +FLATTEN_INLINE_PURE static inline bool avx2_mem_equal(const char * p1, const char * p2, size_t n) noexcept { constexpr size_t loop_block32x4_size = AVX2_UNROLL_NUM * BLOCK32_SIZE; // n <= 32 if (likely(n <= BLOCK32_SIZE)) { +#if !defined(AVX2_MEM_EQ_NORMAL_IF_ELSE) + #ifdef M static_assert(false, "`M` is defined"); #else @@ -359,8 +392,8 @@ FLATTEN_INLINE_PURE static inline bool avx2_mem_equal(const char * p1, const cha } #undef M -// an optional way to check small str -#if defined(AVX2_MEM_EQ_NORMAL_IF_ELSE) +#else + // an optional way to check small str if (unlikely(n < 2)) { // 0~1 @@ -415,10 +448,10 @@ FLATTEN_INLINE_PURE static inline bool avx2_mem_equal(const char * p1, const cha return false; { // align addr of one data pointer - auto offset = BLOCK32_SIZE - OFFSET_FROM_ALIGNED(size_t(p2), BLOCK32_SIZE); - p1 += offset; - p2 += 
offset; - n -= offset; + auto offset = ssize_t(OFFSET_FROM_ALIGNED(size_t(p2), BLOCK32_SIZE)) - BLOCK32_SIZE; + p1 -= offset; + p2 -= offset; + n += offset; } for (; n >= loop_block32x4_size;) diff --git a/libs/libcommon/include/common/mem_utils_opt.h b/libs/libcommon/include/common/mem_utils_opt.h index 66057ccdd51..e691a1563aa 100644 --- a/libs/libcommon/include/common/mem_utils_opt.h +++ b/libs/libcommon/include/common/mem_utils_opt.h @@ -93,8 +93,7 @@ FLATTEN_INLINE_PURE static inline int CompareStrView(const std::string_view & lh if (ret == 0) { - auto a = lhs.size(), b = rhs.size(); - ret = (a == b) ? 0 : (a < b ? -1 : 1); + ret = (lhs.size() == rhs.size()) ? 0 : (lhs.size() < rhs.size() ? -1 : 1); } return ret; #else diff --git a/libs/libcommon/include/common/memcpy.h b/libs/libcommon/include/common/memcpy.h new file mode 100644 index 00000000000..fbdbc0f1527 --- /dev/null +++ b/libs/libcommon/include/common/memcpy.h @@ -0,0 +1,32 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include + +#if defined(__SSE2__) +#include +#endif + +ALWAYS_INLINE static inline void * inline_memcpy(void * __restrict dst, const void * __restrict src, size_t size) +{ +#if defined(__SSE2__) + return sse2_inline_memcpy(dst, src, size); +#else + return std::memcpy(dst, src, size); +#endif +} diff --git a/libs/libcommon/include/common/sse2_memcpy.h b/libs/libcommon/include/common/sse2_memcpy.h new file mode 100644 index 00000000000..2919bb0b866 --- /dev/null +++ b/libs/libcommon/include/common/sse2_memcpy.h @@ -0,0 +1,139 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include +#include + +// Custom inline memcpy implementation for TiFlash. +// - it is recommended for use in inline functions when `sse2` is supported +// - it performs better than `legacy::inline_memcpy` (from clickhouse) according to `libs/libcommon/src/tests/bench_memcpy.cpp` +// - like `std::memcpy`, the behavior is undefined when the source and the destination objects overlap +// - moving data from register to memory costs more than the reverse, so it pays to reduce the number of copy operations.
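The small-size branches in the implementation below rely on one trick: for a size class (N, 2N], copying the first N bytes and the last N bytes covers the whole range, even though the two copies may overlap. A minimal standalone sketch for the 5..8-byte class (illustrative name):

    #include <cstring>

    // Copy `size` bytes where 5 <= size <= 8: the head and tail 4-byte copies
    // overlap in the middle, so no per-byte loop or branch on the exact size is needed.
    inline void copy_5_to_8(char * dst, const char * src, size_t size)
    {
        std::memcpy(dst, src, 4);                       // head
        std::memcpy(dst + size - 4, src + size - 4, 4); // tail, may overlap the head copy
    }

The MCP/MCP_END macro pair below applies the same pattern for 2-, 4-, 8- and 16-byte chunks.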
+ALWAYS_INLINE static inline void * sse2_inline_memcpy(void * __restrict dst_, const void * __restrict src_, size_t size) +{ + char * __restrict dst = reinterpret_cast(dst_); + const char * __restrict src = reinterpret_cast(src_); + + void * ret = dst; + +#if defined(MCP) || defined(MCP_END) + static_assert(false); +#endif + +#define MCP_END(n) tiflash_compiler_builtin_memcpy(dst + size - (n), src + size - (n), (n)); +#define MCP(n) tiflash_compiler_builtin_memcpy(dst, src, (n)); + + if (likely(size <= 32)) + { + if (unlikely(size <= 1)) + { + if (likely(size == 1)) + { + /// A single byte. + *dst = *src; + } + /// No bytes remaining. + } + else if (unlikely(size <= 4)) + { + /// Chunks of 2..4 bytes. + MCP(2); + MCP_END(2); + } + else if (unlikely(size <= 8)) + { + /// Chunks of 5..8 bytes. + MCP(4); + MCP_END(4); + } + else if (unlikely(size <= 16)) + { + /// Chunks of 9..16 bytes. + MCP(8); + MCP_END(8); + } + else + { + /// Chunks of 17..32 bytes. + MCP(16); + MCP_END(16); + } + } + else + { + if (unlikely(size > 128)) + { + /// Large size with fully unrolled loop. + { + MCP(16); + + // reduce instruction: `offset` = or (`dst`, 0xfffffffffffffff0) + auto offset = ssize_t(size_t(dst) % 16) - 16; + dst -= offset; + src -= offset; + size += offset; + } + + /// Aligned unrolled copy. + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + while (size >= 128) + { + c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); + c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); + c2 = _mm_loadu_si128(reinterpret_cast(src) + 2); + c3 = _mm_loadu_si128(reinterpret_cast(src) + 3); + c4 = _mm_loadu_si128(reinterpret_cast(src) + 4); + c5 = _mm_loadu_si128(reinterpret_cast(src) + 5); + c6 = _mm_loadu_si128(reinterpret_cast(src) + 6); + c7 = _mm_loadu_si128(reinterpret_cast(src) + 7); + src += 128; + _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 0), c0); + _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 1), c1); + _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 2), c2); + _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 3), c3); + _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 4), c4); + _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 5), c5); + _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 6), c6); + _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 7), c7); + dst += 128; + + size -= 128; + } + } + + // size <= 128 + + while (size > 16) + { + MCP(16); + + dst += 16; + src += 16; + size -= 16; + } + + // size <= 16 + MCP_END(16); + } + return ret; + +#undef MCP +#undef MCP_END +} diff --git a/libs/libcommon/src/tests/CMakeLists.txt b/libs/libcommon/src/tests/CMakeLists.txt index 3383ee50da2..43d33bff010 100644 --- a/libs/libcommon/src/tests/CMakeLists.txt +++ b/libs/libcommon/src/tests/CMakeLists.txt @@ -40,15 +40,21 @@ add_executable (gtests_libcommon gtest_logger.cpp gtest_arithmetic_overflow.cpp ) -target_link_libraries (gtests_libcommon gtest_main common) +target_link_libraries (gtests_libcommon gtest_main common memcpy) add_check(gtests_libcommon) +set (bench_libcommon_sources bench_mem_utils.cpp) + +if (NOT USE_INTERNAL_MEMCPY) + list (APPEND bench_libcommon_sources bench_memcpy.cpp) +endif () + add_executable(bench_libcommon # TODO: need to fix broken src file if necessary # bench_logger.cpp - bench_mem_utils.cpp + ${bench_libcommon_sources} ) -target_link_libraries(bench_libcommon benchmark::benchmark_main common m) +target_link_libraries(bench_libcommon benchmark::benchmark_main common m memcpy) add_executable (dump_variable dump_variable.cpp) target_link_libraries (dump_variable 
clickhouse_common_io) diff --git a/libs/libcommon/src/tests/bench_mem_utils.cpp b/libs/libcommon/src/tests/bench_mem_utils.cpp index 7531202ad7f..8ae2c58de30 100644 --- a/libs/libcommon/src/tests/bench_mem_utils.cpp +++ b/libs/libcommon/src/tests/bench_mem_utils.cpp @@ -35,6 +35,7 @@ constexpr size_t RESERVE_OFFSET = 200; constexpr size_t TEST_ALIGN_SIZE = 64; static_assert(RESERVE_OFFSET > TEST_ALIGN_SIZE * 2); constexpr char DEFAULT_INIT_CHAR = '0'; +constexpr char DEFAULT_TEST_CHAR = '1'; static constexpr size_t TEST_ALIGN_OFF_1 = 15; static constexpr size_t TEST_ALIGN_OFF_2 = 31; @@ -74,6 +75,36 @@ class MemUtilsEqual : public benchmark::Fixture } }; +template +class MemUtilsCmp : public benchmark::Fixture +{ +protected: + std::string inner_data1; + std::string inner_data2; + std::string_view data1; + std::string_view data2; + +public: + static constexpr size_t max_size = max_src_size; + + void SetUp(const ::benchmark::State & /*state*/) override + { + inner_data1.resize(max_size + RESERVE_OFFSET, DEFAULT_INIT_CHAR); + inner_data2 = inner_data1; + + { + const auto * src = reinterpret_cast((size_t(inner_data1.data()) + TEST_ALIGN_SIZE - 1) / TEST_ALIGN_SIZE * TEST_ALIGN_SIZE + TEST_ALIGN_OFF_1); // start address not aligned + data1 = {src, max_size}; + } + + { + auto * src = reinterpret_cast((size_t(inner_data2.data()) + TEST_ALIGN_SIZE - 1) / TEST_ALIGN_SIZE * TEST_ALIGN_SIZE + TEST_ALIGN_OFF_2); // start address not aligned + src[max_size - 1] = DEFAULT_TEST_CHAR; + data2 = {src, max_size}; + } + } +}; + template class MemUtilsStrStr : public benchmark::Fixture { @@ -124,6 +155,10 @@ ALWAYS_INLINE static inline bool stl_mem_eq(const char * p1, const char * p2, si { return std::memcmp(p1, p2, n) == 0; // call bcmp@plt } +ALWAYS_INLINE static inline int stl_mem_cmp(const char * p1, const char * p2, size_t n) +{ + return std::memcmp(p1, p2, n); // call memcmp@plt +} NO_INLINE size_t stl_str_find(std::string_view s, std::string_view p) { @@ -132,24 +167,52 @@ NO_INLINE size_t stl_str_find(std::string_view s, std::string_view p) // volatile value is used to prevent compiler optimization for fixed context -#define BENCH_MEM_EQ(name1, name2, func, iter_cnt) \ - BENCHMARK_DEFINE_F(name1, name2) \ - (benchmark::State & state) \ - { \ - [[maybe_unused]] volatile size_t _volatile_flags = 1; \ - [[maybe_unused]] volatile size_t cnt = max_size; \ - for (auto _ : state) \ - { \ - _volatile_flags = func(data1.data(), data2.data(), cnt); \ - if constexpr (varify_res) \ - { \ - if (unlikely(!_volatile_flags)) \ - exit(-1); \ - } \ - } \ - } \ +#define BENCH_MEM_EQ(name1, name2, func, loop_cnt, iter_cnt) \ + BENCHMARK_DEFINE_F(name1, name2) \ + (benchmark::State & state) \ + { \ + [[maybe_unused]] volatile size_t _volatile_flags = 1; \ + [[maybe_unused]] volatile size_t cnt = max_size; \ + for (auto _ : state) \ + { \ + for (size_t i = 0; i < (loop_cnt); ++i) \ + { \ + _volatile_flags = func(data1.data(), data2.data(), cnt > i ? cnt - i : cnt); \ + if constexpr (varify_res) \ + { \ + if (unlikely(!_volatile_flags)) \ + exit(-1); \ + } \ + } \ + } \ + } \ BENCHMARK_REGISTER_F(name1, name2)->Iterations(iter_cnt); +#define BENCH_MEM_CMP(name1, name2, func, loop_cnt, iter_cnt) \ + BENCHMARK_DEFINE_F(name1, name2) \ + (benchmark::State & state) \ + { \ + [[maybe_unused]] volatile int _volatile_flags = 1; \ + [[maybe_unused]] volatile size_t cnt = max_size; \ + for (auto _ : state) \ + { \ + for (size_t i = 0; i < (loop_cnt); ++i) \ + { \ + size_t ori = cnt; \ + size_t n = ori > i ? 
ori - i : ori; \ + size_t diff = ori - n; \ + _volatile_flags = func(data1.data() + diff, data2.data() + diff, n); \ + if constexpr (varify_res) \ + { \ + if (unlikely(!(_volatile_flags < 0))) \ + { \ + exit(-1); \ + } \ + } \ + } \ + } \ + } \ + BENCHMARK_REGISTER_F(name1, name2)->Iterations(iter_cnt); #define BENCH_MEM_STRSTR(name1, name2, func, iter_cnt) \ BENCHMARK_DEFINE_F(name1, name2) \ @@ -174,23 +237,50 @@ NO_INLINE size_t stl_str_find(std::string_view s, std::string_view p) BENCHMARK_REGISTER_F(name1, name2)->Iterations(iter_cnt); -#define BENCH_MEM_EQ_ALL(max_src_size, iter_cnt) \ - using MemUtilsEqual##_##max_src_size = MemUtilsEqual; \ - BENCH_MEM_EQ(MemUtilsEqual##_##max_src_size, stl_mem_eq, stl_mem_eq, iter_cnt) \ - BENCH_MEM_EQ(MemUtilsEqual##_##max_src_size, mem_utils_memoryEqual_avx512, mem_utils::memoryEqual, iter_cnt) \ - BENCH_MEM_EQ(MemUtilsEqual##_##max_src_size, avx2_mem_equal, mem_utils::avx2_mem_equal, iter_cnt) +#define BENCH_MEM_EQ_ALL_IMPL(id, max_src_size, loop_cnt, iter_cnt) \ + using id = MemUtilsEqual; \ + BENCH_MEM_EQ(id, stl_mem_eq, stl_mem_eq, loop_cnt, iter_cnt) \ + BENCH_MEM_EQ(id, mem_utils_memoryEqual_avx512, mem_utils::memoryEqual, loop_cnt, iter_cnt) \ + BENCH_MEM_EQ(id, avx2_mem_equal, mem_utils::avx2_mem_equal, loop_cnt, iter_cnt) + +#define BENCH_MEM_EQ_IMPL_ID(max_src_size, loop_cnt, iter_cnt) MemUtilsEqual##_##max_src_size##_##loop_cnt + +#define BENCH_MEM_EQ_ALL(max_src_size, loop_cnt, iter_cnt) \ + BENCH_MEM_EQ_ALL_IMPL(BENCH_MEM_EQ_IMPL_ID(max_src_size, loop_cnt, iter_cnt), max_src_size, loop_cnt, iter_cnt) + +#define BENCH_MEM_CMP_ALL_IMPL(id, max_src_size, loop_cnt, iter_cnt) \ + using id = MemUtilsCmp; \ + BENCH_MEM_CMP(id, stl_mem_cmp, stl_mem_cmp, loop_cnt, iter_cnt) \ + BENCH_MEM_CMP(id, avx2_mem_cmp, mem_utils::avx2_mem_cmp, loop_cnt, iter_cnt) + +#define BENCH_MEM_CMP_IMPL_ID(max_src_size, loop_cnt, iter_cnt) MemUtilsCmp##_##max_src_size##_##loop_cnt + +#define BENCH_MEM_CMP_ALL(max_src_size, loop_cnt, iter_cnt) \ + BENCH_MEM_CMP_ALL_IMPL(BENCH_MEM_CMP_IMPL_ID(max_src_size, loop_cnt, iter_cnt), max_src_size, loop_cnt, iter_cnt) #define BENCH_MEM_STRSTR_ALL(max_cnt, max_src_size, max_needle_size, iter_cnt) \ using MemUtilsStrStr##_##max_src_size##_##max_needle_size = MemUtilsStrStr; \ BENCH_MEM_STRSTR(MemUtilsStrStr##_##max_src_size##_##max_needle_size, stl_str_find, stl_str_find, iter_cnt) \ BENCH_MEM_STRSTR(MemUtilsStrStr##_##max_src_size##_##max_needle_size, avx2_strstr, mem_utils::avx2_strstr, iter_cnt) -BENCH_MEM_EQ_ALL(13, 2000) -BENCH_MEM_EQ_ALL(65, 2000) -BENCH_MEM_EQ_ALL(100, 500) -BENCH_MEM_EQ_ALL(10000, 500) -BENCH_MEM_EQ_ALL(100000, 500) -BENCH_MEM_EQ_ALL(1000000, 200) +#define BENCH_MEM_EQ_LOOP 20 + +BENCH_MEM_EQ_ALL(13, BENCH_MEM_EQ_LOOP, 2000) +BENCH_MEM_EQ_ALL(65, BENCH_MEM_EQ_LOOP, 2000) +BENCH_MEM_EQ_ALL(100, BENCH_MEM_EQ_LOOP, 500) +BENCH_MEM_EQ_ALL(10000, BENCH_MEM_EQ_LOOP, 500) +BENCH_MEM_EQ_ALL(100000, BENCH_MEM_EQ_LOOP, 500) +BENCH_MEM_EQ_ALL(1000000, BENCH_MEM_EQ_LOOP, 200) + +#define BENCH_MEM_CMP_LOOP 20 + +BENCH_MEM_CMP_ALL(2, BENCH_MEM_CMP_LOOP, 2000) +BENCH_MEM_CMP_ALL(13, BENCH_MEM_CMP_LOOP, 2000) +BENCH_MEM_CMP_ALL(65, BENCH_MEM_CMP_LOOP, 2000) +BENCH_MEM_CMP_ALL(100, BENCH_MEM_CMP_LOOP, 500) +BENCH_MEM_CMP_ALL(10000, BENCH_MEM_CMP_LOOP, 500) +BENCH_MEM_CMP_ALL(100000, BENCH_MEM_CMP_LOOP, 500) +BENCH_MEM_CMP_ALL(1000000, BENCH_MEM_CMP_LOOP, 200) BENCH_MEM_STRSTR_ALL(512, 1024, 1, 100); BENCH_MEM_STRSTR_ALL(512, 1024, 7, 100); diff --git a/libs/libcommon/src/tests/bench_memcpy.cpp 
b/libs/libcommon/src/tests/bench_memcpy.cpp new file mode 100644 index 00000000000..4028dc9ae6a --- /dev/null +++ b/libs/libcommon/src/tests/bench_memcpy.cpp @@ -0,0 +1,139 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../libmemcpy/folly/FollyMemcpy.h" +#include "../../libmemcpy/memcpy.h" + +namespace bench +{ + +ALWAYS_INLINE static inline void * tiflash_internal_memcpy(void * __restrict dst, const void * __restrict src, size_t size) +{ + return sse2_inline_memcpy(dst, src, size); +} + +template +class MemUtilsCopy : public benchmark::Fixture +{ +protected: + static const size_t loop_cnt = loop_cnt_; + + std::string dst_buffer; + std::string src_buffer; + std::vector sizes; + std::vector dst_offsets; + std::vector src_offsets; + + void SetUp(const ::benchmark::State & /*state*/) override + { + size_t src_buffer_size = (sysconf(_SC_PAGE_SIZE) * std::ceil(static_cast(max + 2 * align) / sysconf(_SC_PAGE_SIZE))); + size_t dst_buffer_size; + if (hot) + { + dst_buffer_size = src_buffer_size; + } + else + { + dst_buffer_size = 1024 * 1024 * 1024; // 1 GiB + } + dst_buffer.resize(dst_buffer_size); + memset(dst_buffer.data(), 'd', dst_buffer.size()); + src_buffer.resize(src_buffer_size); + memset(src_buffer.data(), 's', src_buffer.size()); + + std::default_random_engine gen; + sizes.resize(4095); + std::uniform_int_distribution size_dist(min, max); + for (auto & size : sizes) + { + size = size_dist(gen); + } + + src_offsets.resize(4096); + dst_offsets.resize(4096); + std::uniform_int_distribution src_offset_dist( + 0, + (src_buffer_size - max) / align); + std::uniform_int_distribution dst_offset_dist( + 0, + (dst_buffer_size - max) / align); + for (size_t i = 0; i < src_offsets.size(); i++) + { + src_offsets[i] = align * src_offset_dist(gen); + dst_offsets[i] = align * dst_offset_dist(gen); + } + } +}; + +#define BENCH_MEM_COPY(id, name, fn_memcpy, iters) \ + BENCHMARK_DEFINE_F(id, name) \ + (benchmark::State & state) \ + { \ + for (auto _ : state) \ + { \ + size_t size_idx = 0; \ + size_t offset_idx = 0; \ + for (unsigned int i = 0; i < loop_cnt; i++) \ + { \ + if (size_idx + 1 == sizes.size()) \ + size_idx = 0; \ + if (offset_idx >= src_offsets.size()) \ + offset_idx = 0; \ + void * dst = &dst_buffer[dst_offsets[offset_idx]]; \ + const void * src = &src_buffer[src_offsets[offset_idx]]; \ + volatile size_t size = sizes[size_idx]; \ + fn_memcpy(dst, src, size); \ + size_idx++; \ + offset_idx++; \ + } \ + } \ + } \ + BENCHMARK_REGISTER_F(id, name)->Iterations(iters); + +#define BENCH_MEM_COPY_ALL_IMPL(id, min, max, align, hot, loop_cnt, iters) \ + using id = MemUtilsCopy; \ + BENCH_MEM_COPY(id, stl_mempy, std::memcpy, iters) \ + BENCH_MEM_COPY(id, inline_clickhouse_memcpy, legacy::inline_memcpy, iters) \ + BENCH_MEM_COPY(id, inline_tiflash_memcpy, tiflash_internal_memcpy, iters) \ + 
BENCH_MEM_COPY(id, folly_memcpy, __folly_memcpy, iters) + +#define BENCH_MEM_IMPL_ID(min, max, align, hot, loop_cnt) MemUtilsCopy##_##min##_##max##_##align##_##hot##_##loop_cnt + +#define BENCH_MEM_COPY_ALL(min, max, align, hot, loop_cnt, iters) \ + BENCH_MEM_COPY_ALL_IMPL(BENCH_MEM_IMPL_ID(min, max, align, hot, loop_cnt), min, max, align, hot, loop_cnt, iters) + +BENCH_MEM_COPY_ALL(1, 20, 3, true, 20000, 500); +BENCH_MEM_COPY_ALL(1, 40, 3, true, 20000, 500); +BENCH_MEM_COPY_ALL(1, 80, 3, true, 20000, 500); +BENCH_MEM_COPY_ALL(1, 200, 3, true, 20000, 500); +BENCH_MEM_COPY_ALL(1, 1000, 3, true, 20000, 500); + + +} // namespace bench \ No newline at end of file diff --git a/libs/libcommon/src/tests/gtest_mem_utils_opt.cpp b/libs/libcommon/src/tests/gtest_mem_utils_opt.cpp index 3895577dca5..49b04ecbb42 100644 --- a/libs/libcommon/src/tests/gtest_mem_utils_opt.cpp +++ b/libs/libcommon/src/tests/gtest_mem_utils_opt.cpp @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -25,6 +26,8 @@ #include #include +#include "../../libmemcpy/folly/FollyMemcpy.h" + #if defined(TIFLASH_ENABLE_AVX_SUPPORT) void TestFunc(size_t size) @@ -152,6 +155,13 @@ TEST(MemUtilsTestOPT, CompareNormal) std::string b(2, char(1)); ASSERT_EQ(-1, mem_utils::StrFind({start, size}, b)); + ASSERT_EQ(-1, + mem_utils::avx2_strstr(start, size, b.data(), b.size())); + } + { + std::string a(32, char(0)); + char * p = a.data() + 16 - size_t(a.data()) % 16 + 5; + ASSERT_EQ(nullptr, mem_utils::avx2_memchr(p, 5, char(1))); } } @@ -172,3 +182,63 @@ TEST(MemUtilsTestOPT, CompareStr) } #endif + +template +void TestMemCopyFunc(size_t size, F && fn_memcpy) +{ + std::string oa(size + 100, 0); + char * start = oa.data(); + start += (16 - size_t(start) % 16); + start += 5; + { + uint8_t n1 = 1, n2 = 2; + for (auto * p = start; p != start + size; ++p) + { + *p = n1 + n2; + n1 = n2; + n2 = *p; + } + } + + std::string ob; + char * tar{}; + + if constexpr (overlap_offset) + tar = start + overlap_offset; + else + { + ob.resize(size + 100, 0); + tar = ob.data(); + tar += (16 - size_t(tar) % 16); + tar += 1; + } + + fn_memcpy(tar, start, size); + { + uint8_t n1 = 1, n2 = 2; + for (const auto * p = tar; p != tar + size; ++p) + { + ASSERT_EQ(uint8_t(*p), uint8_t(n1 + n2)); + n1 = n2; + n2 = *p; + } + } +} + +#if defined(__SSE2__) + +TEST(MemUtilsTestOPT, Memcopy) +{ + for (size_t size = 0; size < 256; ++size) + { + TestMemCopyFunc<0>(size, __folly_memcpy); + { + // test memmove + TestMemCopyFunc<3>(size, __folly_memcpy); + TestMemCopyFunc<-3>(size, __folly_memcpy); + } + TestMemCopyFunc<0>(size, inline_memcpy); + } +} + +#endif \ No newline at end of file diff --git a/libs/libmemcpy/CMakeLists.txt b/libs/libmemcpy/CMakeLists.txt index 2efd2cb78e4..a329d64bb49 100644 --- a/libs/libmemcpy/CMakeLists.txt +++ b/libs/libmemcpy/CMakeLists.txt @@ -12,5 +12,36 @@ # See the License for the specific language governing permissions and # limitations under the License. 
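As a side note on the test strategy: TestMemCopyFunc above fills the source with a position-dependent byte pattern, copies it with the implementation under test (optionally into an overlapping destination to exercise memmove semantics), and then re-derives the pattern at the destination. A minimal standalone sketch of the same idea follows; the helper name, the fixed 3-byte overlap, and the use of assert are illustrative only and not part of this patch.

```cpp
// Pattern-fill-and-verify sketch for a copy routine, modeled on TestMemCopyFunc.
// `copy_fn` stands for whichever implementation is under test
// (__folly_memcpy, inline_memcpy, ...). Illustrative only.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

template <typename CopyFn>
void check_copy_with_overlap(size_t size, CopyFn copy_fn)
{
    std::vector<uint8_t> buf(size + 64, 0);
    uint8_t * src = buf.data();
    // Deterministic, position-dependent pattern so one misplaced byte is caught.
    uint8_t n1 = 1, n2 = 2;
    for (size_t i = 0; i < size; ++i)
    {
        src[i] = static_cast<uint8_t>(n1 + n2);
        n1 = n2;
        n2 = src[i];
    }
    uint8_t * dst = src + 3; // overlapping destination => memmove semantics
    copy_fn(dst, src, size);
    // Re-derive the pattern at the destination and compare.
    n1 = 1;
    n2 = 2;
    for (size_t i = 0; i < size; ++i)
    {
        assert(dst[i] == static_cast<uint8_t>(n1 + n2));
        n1 = n2;
        n2 = dst[i];
    }
}
```

For example, `check_copy_with_overlap(200, __folly_memcpy)` checks memmove-style behavior on an overlapping pair of buffers, which is the same situation the overlap variants of TestMemCopyFunc cover.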
-add_library (memcpy STATIC memcpy.cpp) +option(TIFLASH_FOLLY_MEMCPY_IS_MEMCPY "use folly memcpy as default `memcpy` and `memmove`" ON) + +set (memcpy_sources) + +# only enbale folly memcpy under linux x86_64 with avx2 supported +if (ARCH_LINUX AND TIFLASH_ENABLE_AVX_SUPPORT) + set_property(SOURCE folly/memcpy.S PROPERTY LANGUAGE CXX) + set_property(SOURCE folly/memcpy.S APPEND PROPERTY COMPILE_OPTIONS "-x" "assembler-with-cpp") + set_property(SOURCE folly/memcpy.S APPEND PROPERTY COMPILE_FLAGS "-mavx -mavx2") + list (APPEND memcpy_sources folly/memcpy.S) + message (STATUS "`libmemcpy` support Folly memcpy") +else () + set (TIFLASH_FOLLY_MEMCPY_IS_MEMCPY OFF) +endif () + +if (USE_INTERNAL_MEMCPY) + if (TIFLASH_FOLLY_MEMCPY_IS_MEMCPY) + message (STATUS "Using Folly memcpy as default `memcpy` and `memmove`") + add_definitions(-DFOLLY_MEMCPY_IS_MEMCPY=1) + else () + message (STATUS "Using internal memcpy") + list (APPEND memcpy_sources memcpy.cpp) + endif () +else () + add_definitions(-DNO_TIFLASH_INTERNAL_MEMCPY=1) + list (APPEND memcpy_sources memcpy.cpp) +endif() + +add_library (memcpy STATIC ${memcpy_sources}) target_include_directories(memcpy PUBLIC ${TiFlash_SOURCE_DIR}/libs/libcommon/include) + +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fomit-frame-pointer") +set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fomit-frame-pointer") diff --git a/libs/libmemcpy/folly/FollyMemcpy.h b/libs/libmemcpy/folly/FollyMemcpy.h new file mode 100644 index 00000000000..4022d1b4b13 --- /dev/null +++ b/libs/libmemcpy/folly/FollyMemcpy.h @@ -0,0 +1,40 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// @author: Logan Evans + +#include + +extern "C" { +void * __folly_memcpy( + void * __restrict dst, + const void * __restrict src, + size_t size); +} diff --git a/libs/libmemcpy/folly/memcpy.S b/libs/libmemcpy/folly/memcpy.S new file mode 100644 index 00000000000..acdfcf7d18d --- /dev/null +++ b/libs/libmemcpy/folly/memcpy.S @@ -0,0 +1,479 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * __folly_memcpy: An optimized memcpy implementation that uses prefetch and + * AVX2 instructions. + * + * This implementation of memcpy acts as a memmove: while overlapping copies + * are undefined in memcpy, in some implementations they're the same function and + * legacy programs rely on this behavior. + * + * This implementation uses prefetch to avoid dtlb misses. This can + * substantially reduce dtlb store misses in cases where the destination + * location is absent from L1 cache and where the copy size is small enough + * that the hardware prefetcher doesn't have a large impact. + * + * The number of branches is limited by the use of overlapping loads & stores. + * This helps with copies where the source and destination cache lines are already + * present in L1 because there are fewer instructions to execute and fewer + * branches to potentially mispredict. + * e.g. to copy the last 4 <= n <= 7 bytes: copy the first & last 4 bytes (overlapped): + * movl (%rsi), %r8d + * movl -4(%rsi,%rdx), %r9d + * movl %r8d, (%rdi) + * movl %r9d, -4(%rdi,%rdx) + * + * + * For sizes up to 256 all source data is first read into registers and then written: + * - n <= 16: overlapping movs + * - n <= 32: overlapping unaligned 16-byte SSE XMM load/stores + * - n <= 256: overlapping unaligned 32-byte AVX YMM load/stores + * + * Large copies (> 256 bytes) use unaligned loads + aligned stores. + * This is observed to always be faster than rep movsb, so the rep movsb + * instruction is not used. + * - The head & tail may be unaligned => they're always written using unaligned stores. + * + * If the copy size is humongous (> 32 KiB) and the source and destination are both + * aligned, this memcpy will use non-temporal operations (AVX2). This can have + * a substantial speedup for copies where data is absent from L1, but it + * is significantly slower if the source and destination data were already + * in L1. The use of non-temporal operations also has the effect that after + * the copy is complete, the data will be moved out of L1, even if the data was + * present before the copy started. 
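The "overlapping loads & stores" idea from the comment above is worth spelling out at the C level. This is a minimal sketch of the 4 <= n <= 7 case shown with movl instructions; the function name is illustrative and the authoritative version is the assembly that follows.

```cpp
// Overlapping load/store trick for 4 <= n <= 7 bytes: read the first and last
// 4 bytes before writing anything, so two (possibly overlapping) stores cover
// the whole range with no per-byte loop and no extra branches.
#include <cstddef>
#include <cstdint>
#include <cstring>

static inline void copy_4_to_7(void * dst, const void * src, size_t n)
{
    // Precondition: 4 <= n <= 7.
    uint32_t head, tail;
    std::memcpy(&head, src, 4);
    std::memcpy(&tail, static_cast<const char *>(src) + n - 4, 4);
    std::memcpy(dst, &head, 4);
    std::memcpy(static_cast<char *>(dst) + n - 4, &tail, 4);
}
```

Because both loads complete before either store, the routine is also correct when source and destination overlap, which is why copies of up to 256 bytes behave as memmove in this implementation.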
+ * + * For n > 256 and overlapping src & dst buffers (memmove): + * - use unaligned loads + aligned stores, but not non-temporal stores + * - for dst < src forward copy in 128 byte batches: + * - unaligned load the first 32 bytes & last 4 x 32 bytes + * - forward copy (unaligned load + aligned stores) 4 x 32 bytes at a time + * - unaligned store the first 32 bytes & last 4 x 32 bytes + * - for dst > src backward copy in 128 byte batches: + * - unaligned load the first 4 x 32 bytes & last 32 bytes + * - backward copy (unaligned load + aligned stores) 4 x 32 bytes at a time + * - unaligned store the first 4 x 32 bytes & last 32 bytes + * + * @author Logan Evans + */ + +#if defined(__AVX2__) + +#if defined(PREFETCH) +#undef PREFETCH +#endif +#if __PRFCHW__ // Broadwell+ +#define PREFETCH prefetchw +#else +#define PREFETCH prefetcht0 +#endif + +// This threshold is half of L1 cache on a Skylake machine, which means that +// potentially all of L1 will be populated by this copy once it is executed +// (dst and src are cached for temporal copies). +#define NON_TEMPORAL_STORE_THRESHOLD $32768 + + .file "memcpy.S" + .section .text,"ax" + + .type __folly_memcpy_short, @function +__folly_memcpy_short: + .cfi_startproc + +.L_GE1_LE7: + cmp $1, %rdx + je .L_EQ1 + + cmp $4, %rdx + jae .L_GE4_LE7 + +.L_GE2_LE3: + movw (%rsi), %r8w + movw -2(%rsi,%rdx), %r9w + movw %r8w, (%rdi) + movw %r9w, -2(%rdi,%rdx) + ret + + .align 2 +.L_EQ1: + movb (%rsi), %r8b + movb %r8b, (%rdi) + ret + + // Aligning the target of a jump to an even address has a measurable + // speedup in microbenchmarks. + .align 2 +.L_GE4_LE7: + movl (%rsi), %r8d + movl -4(%rsi,%rdx), %r9d + movl %r8d, (%rdi) + movl %r9d, -4(%rdi,%rdx) + ret + + .cfi_endproc + .size __folly_memcpy_short, .-__folly_memcpy_short + +// memcpy is an alternative entrypoint into the function named __folly_memcpy. +// The compiler is able to call memcpy since the name is global while +// stacktraces will show __folly_memcpy since that is the name of the function. +// This is intended to aid in debugging by making it obvious which version of +// memcpy is being used. 
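Before the large-copy entry point, here is a C-level sketch of the n > 256 dispatch it implements: take the overlap-free path (which may use non-temporal stores) only when the two ranges are provably disjoint, otherwise choose forward or backward 128-byte batching depending on which side the destination lies. The enum and function names are illustrative; the real checks are the lea/cmp sequences in the assembly below.

```cpp
// Overlap dispatch for large copies, mirroring the logic described above.
#include <cstddef>
#include <cstdint>

enum class CopyPath { NoOverlap, OverlapForward, OverlapBackward, Nothing };

static inline CopyPath pick_large_copy_path(const void * dst, const void * src, size_t n)
{
    auto d = reinterpret_cast<uintptr_t>(dst);
    auto s = reinterpret_cast<uintptr_t>(src);
    if (s + n <= d || d + n <= s)
        return CopyPath::NoOverlap; // ranges are disjoint: non-temporal stores allowed
    if (d == s)
        return CopyPath::Nothing; // same buffer: nothing to move
    return d < s ? CopyPath::OverlapForward // dst below src: copy front-to-back
                 : CopyPath::OverlapBackward; // dst above src: copy back-to-front
}
```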
+ .align 64 + .globl __folly_memcpy + .type __folly_memcpy, @function + +__folly_memcpy: + .cfi_startproc + + mov %rdi, %rax # return: $rdi + + test %rdx, %rdx + je .L_EQ0 + + PREFETCH (%rdi) + PREFETCH -1(%rdi,%rdx) + + cmp $8, %rdx + jb .L_GE1_LE7 + +.L_GE8: + cmp $32, %rdx + ja .L_GE33 + +.L_GE8_LE32: + cmp $16, %rdx + ja .L_GE17_LE32 + +.L_GE8_LE16: + mov (%rsi), %r8 + mov -8(%rsi,%rdx), %r9 + mov %r8, (%rdi) + mov %r9, -8(%rdi,%rdx) +.L_EQ0: + ret + + .align 2 +.L_GE17_LE32: + movdqu (%rsi), %xmm0 + movdqu -16(%rsi,%rdx), %xmm1 + movdqu %xmm0, (%rdi) + movdqu %xmm1, -16(%rdi,%rdx) + ret + + .align 2 +.L_GE193_LE256: + vmovdqu %ymm3, 96(%rdi) + vmovdqu %ymm4, -128(%rdi,%rdx) + +.L_GE129_LE192: + vmovdqu %ymm2, 64(%rdi) + vmovdqu %ymm5, -96(%rdi,%rdx) + +.L_GE65_LE128: + vmovdqu %ymm1, 32(%rdi) + vmovdqu %ymm6, -64(%rdi,%rdx) + +.L_GE33_LE64: + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm7, -32(%rdi,%rdx) + + vzeroupper + ret + + .align 2 +.L_GE33: + vmovdqu (%rsi), %ymm0 + vmovdqu -32(%rsi,%rdx), %ymm7 + + cmp $64, %rdx + jbe .L_GE33_LE64 + + PREFETCH 64(%rdi) + + vmovdqu 32(%rsi), %ymm1 + vmovdqu -64(%rsi,%rdx), %ymm6 + + cmp $128, %rdx + jbe .L_GE65_LE128 + + PREFETCH 128(%rdi) + + vmovdqu 64(%rsi), %ymm2 + vmovdqu -96(%rsi,%rdx), %ymm5 + + cmp $192, %rdx + jbe .L_GE129_LE192 + + PREFETCH 192(%rdi) + + vmovdqu 96(%rsi), %ymm3 + vmovdqu -128(%rsi,%rdx), %ymm4 + + cmp $256, %rdx + jbe .L_GE193_LE256 + +.L_GE257: + PREFETCH 256(%rdi) + + // Check if there is an overlap. If there is an overlap then the caller + // has a bug since this is undefined behavior. However, for legacy + // reasons this behavior is expected by some callers. + // + // All copies through 256 bytes will operate as a memmove since for + // those sizes all reads are performed before any writes. + // + // This check uses the idea that there is an overlap if + // (%rdi < (%rsi + %rdx)) && (%rsi < (%rdi + %rdx)), + // or equivalently, there is no overlap if + // ((%rsi + %rdx) <= %rdi) || ((%rdi + %rdx) <= %rsi). + // + // %r9 will be used after .L_ALIGNED_DST_LOOP to calculate how many + // bytes remain to be copied. + + // (%rsi + %rdx <= %rdi) => no overlap + lea (%rsi,%rdx), %r9 + cmp %rdi, %r9 + jbe .L_NO_OVERLAP + + // (%rdi + %rdx <= %rsi) => no overlap + lea (%rdi,%rdx), %r8 + cmp %rsi, %r8 + // If no info is available in branch predictor's cache, Intel CPUs assume + // forward jumps are not taken. Use a forward jump as overlapping buffers + // are unlikely. + ja .L_OVERLAP + + .align 2 +.L_NO_OVERLAP: + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, 32(%rdi) + vmovdqu %ymm2, 64(%rdi) + vmovdqu %ymm3, 96(%rdi) + + // Align %rdi to a 32 byte boundary. + // %rcx = 128 - 31 & %rdi + mov $128, %rcx + and $31, %rdi + sub %rdi, %rcx + + lea (%rsi,%rcx), %rsi + lea (%rax,%rcx), %rdi + sub %rcx, %rdx + + // %r8 is the end condition for the loop. 
+ lea -128(%rsi,%rdx), %r8 + + cmp NON_TEMPORAL_STORE_THRESHOLD, %rdx + jae .L_NON_TEMPORAL_LOOP + + .align 2 +.L_ALIGNED_DST_LOOP: + PREFETCH 128(%rdi) + PREFETCH 192(%rdi) + + vmovdqu (%rsi), %ymm0 + vmovdqu 32(%rsi), %ymm1 + vmovdqu 64(%rsi), %ymm2 + vmovdqu 96(%rsi), %ymm3 + add $128, %rsi + + vmovdqa %ymm0, (%rdi) + vmovdqa %ymm1, 32(%rdi) + vmovdqa %ymm2, 64(%rdi) + vmovdqa %ymm3, 96(%rdi) + add $128, %rdi + + cmp %r8, %rsi + jb .L_ALIGNED_DST_LOOP + +.L_ALIGNED_DST_LOOP_END: + sub %rsi, %r9 + mov %r9, %rdx + + vmovdqu %ymm4, -128(%rdi,%rdx) + vmovdqu %ymm5, -96(%rdi,%rdx) + vmovdqu %ymm6, -64(%rdi,%rdx) + vmovdqu %ymm7, -32(%rdi,%rdx) + + vzeroupper + ret + + .align 2 +.L_NON_TEMPORAL_LOOP: + testb $31, %sil + jne .L_ALIGNED_DST_LOOP + // This is prefetching the source data unlike ALIGNED_DST_LOOP which + // prefetches the destination data. This choice is again informed by + // benchmarks. With a non-temporal store the entirety of the cache line + // is being written so the previous data can be discarded without being + // fetched. + prefetchnta 128(%rsi) + prefetchnta 196(%rsi) + + vmovntdqa (%rsi), %ymm0 + vmovntdqa 32(%rsi), %ymm1 + vmovntdqa 64(%rsi), %ymm2 + vmovntdqa 96(%rsi), %ymm3 + add $128, %rsi + + vmovntdq %ymm0, (%rdi) + vmovntdq %ymm1, 32(%rdi) + vmovntdq %ymm2, 64(%rdi) + vmovntdq %ymm3, 96(%rdi) + add $128, %rdi + + cmp %r8, %rsi + jb .L_NON_TEMPORAL_LOOP + + sfence + jmp .L_ALIGNED_DST_LOOP_END + + +.L_OVERLAP: + .align 2 + cmp %rdi, %rsi + jb .L_OVERLAP_BWD // %rsi < %rdi => backward-copy + je .L_RET // %rsi == %rdi => return, nothing to copy + + // Source & destination buffers overlap. Forward copy. + + vmovdqu (%rsi), %ymm8 + + // Align %rdi to a 32 byte boundary. + // %rcx = 32 - 31 & %rdi + mov $32, %rcx + and $31, %rdi + sub %rdi, %rcx + + lea (%rsi,%rcx), %rsi + lea (%rax,%rcx), %rdi + sub %rcx, %rdx + + // %r8 is the end condition for the loop. + lea -128(%rsi,%rdx), %r8 + + +.L_OVERLAP_FWD_ALIGNED_DST_LOOP: + PREFETCH 128(%rdi) + PREFETCH 192(%rdi) + + vmovdqu (%rsi), %ymm0 + vmovdqu 32(%rsi), %ymm1 + vmovdqu 64(%rsi), %ymm2 + vmovdqu 96(%rsi), %ymm3 + add $128, %rsi + + vmovdqa %ymm0, (%rdi) + vmovdqa %ymm1, 32(%rdi) + vmovdqa %ymm2, 64(%rdi) + vmovdqa %ymm3, 96(%rdi) + add $128, %rdi + + cmp %r8, %rsi + jb .L_OVERLAP_FWD_ALIGNED_DST_LOOP + + sub %rsi, %r9 + mov %r9, %rdx + + vmovdqu %ymm4, -128(%rdi,%rdx) + vmovdqu %ymm5, -96(%rdi,%rdx) + vmovdqu %ymm6, -64(%rdi,%rdx) + vmovdqu %ymm7, -32(%rdi,%rdx) + vmovdqu %ymm8, (%rax) // %rax == the original (unaligned) %rdi + + vzeroupper + +.L_RET: + ret + +.L_OVERLAP_BWD: + # Save last 32 bytes. + vmovdqu -32(%rsi, %rdx), %ymm8 + lea -32(%rdi, %rdx), %r9 + + + // %r8 is the end condition for the loop. + lea 128(%rsi), %r8 + + // Align %rdi+%rdx (destination end) to a 32 byte boundary. + // %rcx = (%rdi + %rdx - 32) & 31 + mov %r9, %rcx + and $31, %rcx + // Set %rsi & %rdi to the end of the 32 byte aligned range. 
+ sub %rcx, %rdx + add %rdx, %rsi + add %rdx, %rdi + + +.L_OVERLAP_BWD_ALIGNED_DST_LOOP: + PREFETCH -128(%rdi) + PREFETCH -192(%rdi) + + vmovdqu -32(%rsi), %ymm4 + vmovdqu -64(%rsi), %ymm5 + vmovdqu -96(%rsi), %ymm6 + vmovdqu -128(%rsi), %ymm7 + sub $128, %rsi + + vmovdqa %ymm4, -32(%rdi) + vmovdqa %ymm5, -64(%rdi) + vmovdqa %ymm6, -96(%rdi) + vmovdqa %ymm7, -128(%rdi) + sub $128, %rdi + + cmp %r8, %rsi + ja .L_OVERLAP_BWD_ALIGNED_DST_LOOP + + vmovdqu %ymm0, (%rax) // %rax == the original unaligned %rdi + vmovdqu %ymm1, 32(%rax) + vmovdqu %ymm2, 64(%rax) + vmovdqu %ymm3, 96(%rax) + vmovdqu %ymm8, (%r9) + + vzeroupper + ret + + .cfi_endproc + .size __folly_memcpy, .-__folly_memcpy + +#ifdef FOLLY_MEMCPY_IS_MEMCPY + .weak memcpy + memcpy = __folly_memcpy + + .weak memmove + memmove = __folly_memcpy +#endif + + .ident "GCC: (GNU) 4.8.2" + +#endif +#ifdef __linux__ + .section .note.GNU-stack,"",@progbits +#endif diff --git a/libs/libmemcpy/memcpy.cpp b/libs/libmemcpy/memcpy.cpp index e32ce6ea441..92fe2389b27 100644 --- a/libs/libmemcpy/memcpy.cpp +++ b/libs/libmemcpy/memcpy.cpp @@ -12,11 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "memcpy.h" +#ifndef NO_TIFLASH_INTERNAL_MEMCPY + +#if defined(__SSE2__) + +#include /// This is needed to generate an object file for linking. extern "C" __attribute__((visibility("default"))) void * memcpy(void * __restrict dst, const void * __restrict src, size_t size) { - return inline_memcpy(dst, src, size); + return sse2_inline_memcpy(dst, src, size); } + +#endif + +#endif \ No newline at end of file diff --git a/libs/libmemcpy/memcpy.h b/libs/libmemcpy/memcpy.h index a8bd69967f5..e25e16a6f13 100644 --- a/libs/libmemcpy/memcpy.h +++ b/libs/libmemcpy/memcpy.h @@ -18,6 +18,9 @@ #include #include +namespace legacy +{ + /** Custom memcpy implementation for ClickHouse. * It has the following benefits over using glibc's implementation: * 1. Avoiding dependency on specific version of glibc's symbol, like memcpy@@GLIBC_2.14 for portability. @@ -226,3 +229,4 @@ ALWAYS_INLINE static inline void * inline_memcpy(void * __restrict dst_, const v } return ret; } +} // namespace legacy \ No newline at end of file
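When FOLLY_MEMCPY_IS_MEMCPY is defined, the weak aliases above make the global `memcpy` and `memmove` symbols resolve to `__folly_memcpy`. A quick sanity check along these lines could confirm the aliasing took effect in a given build; this snippet is an assumption-laden sketch, not part of the patch, and it relies on the aliased symbol sharing an address with `__folly_memcpy`.

```cpp
// Sanity check (assumption: built with FOLLY_MEMCPY_IS_MEMCPY=1 so that the
// weak aliases in memcpy.S redirect memcpy/memmove to __folly_memcpy).
#include <cstdio>
#include <cstring>

extern "C" void * __folly_memcpy(void * __restrict dst, const void * __restrict src, size_t size);

int main()
{
    // Compare the addresses of the linked memcpy and __folly_memcpy symbols.
    auto plain = reinterpret_cast<void (*)()>(&std::memcpy);
    auto folly = reinterpret_cast<void (*)()>(&__folly_memcpy);
    std::printf("memcpy aliased to __folly_memcpy: %s\n", plain == folly ? "yes" : "no");
    return 0;
}
```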