diff --git a/CMakeLists.txt b/CMakeLists.txt index e989427d99a..d7c00eb2e9b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -511,6 +511,11 @@ if (ARCH_AMD64) else() add_definitions(-DTIFLASH_COMPILER_VPCLMULQDQ_SUPPORT=0) endif() + + check_cxx_compiler_flag("-mmovbe" TIFLASH_COMPILER_MOVBE_SUPPORT) + if (TIFLASH_COMPILER_MOVBE_SUPPORT) + set(COMPILER_MOVBE_FLAG "-mmovbe") + endif() else() add_definitions(-DTIFLASH_COMPILER_VPCLMULQDQ_SUPPORT=0) endif() diff --git a/dbms/src/AggregateFunctions/AggregateFunctionMinMaxAny.h b/dbms/src/AggregateFunctions/AggregateFunctionMinMaxAny.h index 9ce7060a9eb..5cfe7ded022 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionMinMaxAny.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionMinMaxAny.h @@ -193,28 +193,28 @@ struct SingleValueDataString Int32 size = -1; /// -1 indicates that there is no value. Int32 capacity = 0; /// power of two or zero - char * large_data; - TiDB::TiDBCollatorPtr collator = nullptr; + char * large_data{}; + TiDB::TiDBCollatorPtr collator{}; bool less(const StringRef & a, const StringRef & b) const { - if (collator == nullptr) + if (unlikely(collator == nullptr)) return a < b; - return collator->compare(a.data, a.size, b.data, b.size) < 0; + return collator->compareFastPath(a.data, a.size, b.data, b.size) < 0; } bool greater(const StringRef & a, const StringRef & b) const { - if (collator == nullptr) + if (unlikely(collator == nullptr)) return a > b; - return collator->compare(a.data, a.size, b.data, b.size) > 0; + return collator->compareFastPath(a.data, a.size, b.data, b.size) > 0; } bool equalTo(const StringRef & a, const StringRef & b) const { - if (collator == nullptr) + if (unlikely(collator == nullptr)) return a == b; - return collator->compare(a.data, a.size, b.data, b.size) == 0; + return collator->compareFastPath(a.data, a.size, b.data, b.size) == 0; } public: @@ -222,7 +222,7 @@ struct SingleValueDataString static constexpr Int32 MAX_SMALL_STRING_SIZE = AUTOMATIC_STORAGE_SIZE - sizeof(size) - sizeof(capacity) - sizeof(large_data) - sizeof(collator); private: - char small_data[MAX_SMALL_STRING_SIZE]; /// Including the terminating zero. + char small_data[MAX_SMALL_STRING_SIZE]{}; /// Including the terminating zero. 
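A note on the `compareFastPath` calls above: the helper is introduced later in this patch (Collator.h, backed by the new CollatorCompare.h). For the padding-binary collations it skips the virtual `compare` and reduces to a trailing-space trim plus a byte-wise comparison; other collations still take the virtual call. A minimal sketch using the helpers this patch adds (the wrapper name and include path are illustrative only):

    // Roughly what the fast path boils down to for UTF8MB4_BIN-style (padding) collations.
    #include <Storages/Transaction/CollatorCompare.h> // DB::BinCollatorCompare, new in this patch

    inline int padding_bin_compare(const char * s1, size_t n1, const char * s2, size_t n2)
    {
        // BinCollatorCompare<true> == RtrimStrCompare: strip trailing ' ' from both
        // inputs, then compare the remaining bytes like memcmp.
        return DB::BinCollatorCompare<true>(s1, n1, s2, n2);
    }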
public: bool has() const diff --git a/dbms/src/AggregateFunctions/IAggregateFunction.h b/dbms/src/AggregateFunctions/IAggregateFunction.h index 646be9c928f..4bf308dc21f 100644 --- a/dbms/src/AggregateFunctions/IAggregateFunction.h +++ b/dbms/src/AggregateFunctions/IAggregateFunction.h @@ -439,7 +439,7 @@ struct AggregationCollatorsWrapper if (likely(collators.size() > column_index)) { if (collators[column_index] != nullptr) - return collators[column_index]->sortKey(in.data, in.size, sort_key_containers[column_index]); + return collators[column_index]->sortKeyFastPath(in.data, in.size, sort_key_containers[column_index]); return in; } else if (collators.empty()) diff --git a/dbms/src/Columns/ColumnString.cpp b/dbms/src/Columns/ColumnString.cpp index 5b184ef8983..990e5c80290 100644 --- a/dbms/src/Columns/ColumnString.cpp +++ b/dbms/src/Columns/ColumnString.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -94,7 +95,7 @@ void ColumnString::insertRangeFrom(const IColumn & src, size_t start, size_t len size_t old_chars_size = chars.size(); chars.resize(old_chars_size + nested_length); - memcpy(&chars[old_chars_size], &src_concrete.chars[nested_offset], nested_length); + inline_memcpy(&chars[old_chars_size], &src_concrete.chars[nested_offset], nested_length); if (start == 0 && offsets.empty()) { @@ -315,12 +316,10 @@ int ColumnString::compareAtWithCollationImpl(size_t n, size_t m, const IColumn & { const auto & rhs = static_cast(rhs_); - return collator.compare( - reinterpret_cast(&chars[offsetAt(n)]), - sizeAt(n) - 1, // Skip last zero byte. - reinterpret_cast(&rhs.chars[rhs.offsetAt(m)]), - rhs.sizeAt(m) - 1 // Skip last zero byte. - ); + auto a = getDataAt(n); + auto b = rhs.getDataAt(m); + + return collator.compare(a.data, a.size, b.data, b.size); } // Derived must implement function `int compare(const char *, size_t, const char *, size_t)`. diff --git a/dbms/src/Columns/ColumnString.h b/dbms/src/Columns/ColumnString.h index 2204319e090..ffea54625e9 100644 --- a/dbms/src/Columns/ColumnString.h +++ b/dbms/src/Columns/ColumnString.h @@ -211,10 +211,10 @@ class ColumnString final : public COWPtrHelper StringRef res; - if (collator != nullptr) + if (likely(collator != nullptr)) { // Skip last zero byte. - auto sort_key = collator->sortKey(reinterpret_cast(src), string_size - 1, sort_key_container); + auto sort_key = collator->sortKeyFastPath(reinterpret_cast(src), string_size - 1, sort_key_container); string_size = sort_key.size; src = sort_key.data; } @@ -244,10 +244,10 @@ class ColumnString final : public COWPtrHelper { size_t string_size = sizeAt(n); size_t offset = offsetAt(n); - if (collator != nullptr) + if (likely(collator != nullptr)) { // Skip last zero byte. - auto sort_key = collator->sortKey(reinterpret_cast(&chars[offset]), string_size - 1, sort_key_container); + auto sort_key = collator->sortKeyFastPath(reinterpret_cast(&chars[offset]), string_size - 1, sort_key_container); string_size = sort_key.size; hash.update(reinterpret_cast(&string_size), sizeof(string_size)); hash.update(sort_key.data, sort_key.size); @@ -278,16 +278,7 @@ class ColumnString final : public COWPtrHelper int compareAt(size_t n, size_t m, const IColumn & rhs_, int /*nan_direction_hint*/) const override { const auto & rhs = static_cast(rhs_); - - const size_t size = sizeAt(n); - const size_t rhs_size = rhs.sizeAt(m); - - int cmp = memcmp(&chars[offsetAt(n)], &rhs.chars[rhs.offsetAt(m)], std::min(size, rhs_size)); - - if (cmp != 0) - return cmp; - else - return size > rhs_size ? 
1 : (size < rhs_size ? -1 : 0); + return getDataAtWithTerminatingZero(n).compare(rhs.getDataAtWithTerminatingZero(m)); } int compareAt(size_t n, size_t m, const IColumn & rhs_, int, const ICollator & collator) const override diff --git a/dbms/src/Columns/ColumnsCommon.cpp b/dbms/src/Columns/ColumnsCommon.cpp index 0cfa6eeae29..da468c86505 100644 --- a/dbms/src/Columns/ColumnsCommon.cpp +++ b/dbms/src/Columns/ColumnsCommon.cpp @@ -12,13 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if __SSE2__ -#include -#endif - #include #include - +#include namespace DB { @@ -146,7 +142,7 @@ struct ResultOffsetsBuilder { const auto offsets_size_old = res_offsets.size(); res_offsets.resize(offsets_size_old + SIMD_BYTES); - memcpy(&res_offsets[offsets_size_old], src_offsets_pos, SIMD_BYTES * sizeof(IColumn::Offset)); + inline_memcpy(&res_offsets[offsets_size_old], src_offsets_pos, SIMD_BYTES * sizeof(IColumn::Offset)); if (!first) { @@ -194,7 +190,7 @@ void filterArraysImplGeneric( { const size_t size = src_offsets.size(); if (size != filt.size()) - throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + throw Exception(fmt::format("size of filter {} doesn't match size of column {}", filt.size(), size), ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); ResultOffsetsBuilder result_offsets_builder(res_offsets); @@ -223,7 +219,7 @@ void filterArraysImplGeneric( const auto elems_size_old = res_elems.size(); res_elems.resize(elems_size_old + size); - memcpy(&res_elems[elems_size_old], &src_elems[offset], size * sizeof(T)); + inline_memcpy(&res_elems[elems_size_old], &src_elems[offset], size * sizeof(T)); }; #if __SSE2__ @@ -233,7 +229,7 @@ void filterArraysImplGeneric( while (filt_pos < filt_end_aligned) { - const auto mask = _mm_movemask_epi8(_mm_cmpgt_epi8( + uint32_t mask = _mm_movemask_epi8(_mm_cmpgt_epi8( _mm_loadu_si128(reinterpret_cast(filt_pos)), zero_vec)); @@ -254,13 +250,16 @@ void filterArraysImplGeneric( /// copy elements for SIMD_BYTES arrays at once const auto elems_size_old = res_elems.size(); res_elems.resize(elems_size_old + chunk_size); - memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset], chunk_size * sizeof(T)); + inline_memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset], chunk_size * sizeof(T)); } else { - for (size_t i = 0; i < SIMD_BYTES; ++i) - if (filt_pos[i]) - copy_array(offsets_pos + i); + while (mask) + { + size_t index = __builtin_ctz(mask); + copy_array(offsets_pos + index); + mask = mask & (mask - 1); + } } filt_pos += SIMD_BYTES; diff --git a/dbms/src/Common/ColumnsHashing.h b/dbms/src/Common/ColumnsHashing.h index 525a7f5ab4d..dbf50175007 100644 --- a/dbms/src/Common/ColumnsHashing.h +++ b/dbms/src/Common/ColumnsHashing.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -72,7 +73,7 @@ struct HashMethodOneNumber using Base::getHash; /// (const Data & data, size_t row, Arena & pool) -> size_t /// Is used for default implementation in HashMethodBase. 
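A side note on the `__builtin_ctz` loop added to ColumnsCommon.cpp above: instead of testing every one of the SIMD_BYTES filter bytes, it visits only the set bits of the `_mm_movemask_epi8` result and clears the lowest bit after each hit. The idiom in isolation, as a standalone sketch (the function name is illustrative only):

    #include <cstdint>

    // Invoke `visit` with the index of every set bit in `mask`, lowest first.
    template <typename F>
    inline void for_each_set_bit(uint32_t mask, F && visit)
    {
        while (mask)
        {
            visit(__builtin_ctz(mask)); // index of the lowest set bit
            mask &= mask - 1;           // clear that bit and continue
        }
    }

For sparse filters this costs one iteration per selected row instead of one per byte of the 16-byte block.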
- FieldType getKeyHolder(size_t row, Arena *, std::vector &) const + ALWAYS_INLINE inline FieldType getKeyHolder(size_t row, Arena *, std::vector &) const { if constexpr (std::is_same_v) return vec[row]; @@ -99,7 +100,7 @@ struct HashMethodString HashMethodString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators & collators) { const IColumn & column = *key_columns[0]; - const ColumnString & column_string = assert_cast(column); + const auto & column_string = assert_cast(column); offsets = column_string.getOffsets().data(); chars = column_string.getChars().data(); if (!collators.empty()) @@ -110,15 +111,15 @@ struct HashMethodString } } - auto getKeyHolder(ssize_t row, [[maybe_unused]] Arena * pool, std::vector & sort_key_containers) const + ALWAYS_INLINE inline auto getKeyHolder(ssize_t row, [[maybe_unused]] Arena * pool, std::vector & sort_key_containers) const { auto last_offset = row == 0 ? 0 : offsets[row - 1]; StringRef key(chars + last_offset, offsets[row] - last_offset - 1); if constexpr (place_string_to_arena) { - if (collator) - key = collator->sortKey(key.data, key.size, sort_key_containers[0]); + if (likely(collator)) + key = collator->sortKeyFastPath(key.data, key.size, sort_key_containers[0]); return ArenaKeyHolder{key, *pool}; } else @@ -127,6 +128,98 @@ struct HashMethodString } } +protected: + friend class columns_hashing_impl::HashMethodBase; +}; + +/// For the case when there is multi string key. +template +struct HashMethodMultiString + : public columns_hashing_impl::HashMethodBase, Value, Mapped, false> +{ + using Self = HashMethodMultiString; + using Base = columns_hashing_impl::HashMethodBase; + + std::vector offsets; + std::vector chars; + TiDB::TiDBCollators collators; + bool all_collators_padding_bin = false; + + HashMethodMultiString(const ColumnRawPtrs & key_columns, const Sizes &, const TiDB::TiDBCollators & collators_) + : collators(collators_) + { + size_t num = key_columns.size(); + offsets.resize(num); + chars.resize(num); + + for (size_t i = 0; i < num; ++i) + { + const IColumn & column = *key_columns[i]; + const auto & column_string = assert_cast(column); + offsets[i] = column_string.getOffsets().data(); + chars[i] = column_string.getChars().data(); + } + if (!collators.empty()) + { + all_collators_padding_bin = std::all_of(collators.begin(), collators.end(), [](auto & x) { + return x->isPaddingBinary(); + }); + } + } + + template + ALWAYS_INLINE inline SerializedKeyHolder genSerializedKeyHolder(ssize_t row, Arena * pool, F && fn_handle_key) const + { + auto num = offsets.size(); + + static_assert(std::is_same_v(0)->size)>); + + const char * begin = nullptr; + size_t sum_size = 0; + + for (size_t key_index = 0; key_index < num; ++key_index) + { + auto last_offset = row == 0 ? 
0 : offsets[key_index][row - 1]; + StringRef key(chars[key_index] + last_offset, offsets[key_index][row] - last_offset - 1); + + key = fn_handle_key(key_index, key); + + char * pos = pool->allocContinue(key.size + sizeof(key.size), begin); + { + memcpy(pos, &key.size, sizeof(key.size)); + inline_memcpy(pos + sizeof(key.size), key.data, key.size); + } + + sum_size += key.size + sizeof(key.size); + } + return SerializedKeyHolder{{begin, sum_size}, *pool}; + } + + ALWAYS_INLINE inline auto getKeyHolder(ssize_t row, Arena * pool, std::vector & sort_key_containers) const + { + if (likely(all_collators_padding_bin)) + { + return genSerializedKeyHolder(row, pool, [](size_t, StringRef key) { + return DB::BinCollatorSortKey(key.data, key.size); + }); + } + + if (unlikely(collators.empty())) + { + return genSerializedKeyHolder(row, pool, [](size_t, StringRef key) { + return key; + }); + } + else + { + return genSerializedKeyHolder(row, pool, [&](size_t key_index, StringRef key) { + if (collators[key_index]) + return collators[key_index]->sortKey(key.data, key.size, sort_key_containers[key_index]); + return key; + }); + } + } + protected: friend class columns_hashing_impl::HashMethodBase; }; @@ -147,20 +240,20 @@ struct HashMethodFixedString HashMethodFixedString(const ColumnRawPtrs & key_columns, const Sizes & /*key_sizes*/, const TiDB::TiDBCollators & collators) { const IColumn & column = *key_columns[0]; - const ColumnFixedString & column_string = assert_cast(column); + const auto & column_string = assert_cast(column); n = column_string.getN(); chars = &column_string.getChars(); if (!collators.empty()) collator = collators[0]; } - auto getKeyHolder(size_t row, [[maybe_unused]] Arena * pool, std::vector & sort_key_containers) const + ALWAYS_INLINE inline auto getKeyHolder(size_t row, [[maybe_unused]] Arena * pool, std::vector & sort_key_containers) const { StringRef key(&(*chars)[row * n], n); if (collator) { - key = collator->sortKey(key.data, key.size, sort_key_containers[0]); + key = collator->sortKeyFastPath(key.data, key.size, sort_key_containers[0]); } if constexpr (place_string_to_arena) @@ -280,7 +373,7 @@ struct HashMethodKeysFixed #endif } - ALWAYS_INLINE Key getKeyHolder(size_t row, Arena *, std::vector &) const + ALWAYS_INLINE inline Key getKeyHolder(size_t row, Arena *, std::vector &) const { if constexpr (has_nullable_keys) { @@ -357,7 +450,7 @@ struct HashMethodSerialized , collators(collators_) {} - ALWAYS_INLINE SerializedKeyHolder getKeyHolder(size_t row, Arena * pool, std::vector & sort_key_containers) const + ALWAYS_INLINE inline SerializedKeyHolder getKeyHolder(size_t row, Arena * pool, std::vector & sort_key_containers) const { return SerializedKeyHolder{ serializeKeysToPoolContiguous(row, keys_size, key_columns, collators, sort_key_containers, *pool), @@ -385,7 +478,7 @@ struct HashMethodHashed , collators(collators_) {} - ALWAYS_INLINE Key getKeyHolder(size_t row, Arena *, std::vector & sort_key_containers) const + ALWAYS_INLINE inline Key getKeyHolder(size_t row, Arena *, std::vector & sort_key_containers) const { return hash128(row, key_columns.size(), key_columns, collators, sort_key_containers); } diff --git a/dbms/src/Common/ColumnsHashingImpl.h b/dbms/src/Common/ColumnsHashingImpl.h index 8d8da4318e8..fff3d7a9801 100644 --- a/dbms/src/Common/ColumnsHashingImpl.h +++ b/dbms/src/Common/ColumnsHashingImpl.h @@ -128,21 +128,21 @@ class HashMethodBase using Cache = LastElementCache; template - ALWAYS_INLINE EmplaceResult emplaceKey(Data & data, size_t row, Arena & pool, 
std::vector & sort_key_containers) + ALWAYS_INLINE inline EmplaceResult emplaceKey(Data & data, size_t row, Arena & pool, std::vector & sort_key_containers) { auto key_holder = static_cast(*this).getKeyHolder(row, &pool, sort_key_containers); return emplaceImpl(key_holder, data); } template - ALWAYS_INLINE FindResult findKey(Data & data, size_t row, Arena & pool, std::vector & sort_key_containers) + ALWAYS_INLINE inline FindResult findKey(Data & data, size_t row, Arena & pool, std::vector & sort_key_containers) { auto key_holder = static_cast(*this).getKeyHolder(row, &pool, sort_key_containers); return findKeyImpl(keyHolderGetKey(key_holder), data); } template - ALWAYS_INLINE size_t getHash(const Data & data, size_t row, Arena & pool, std::vector & sort_key_containers) + ALWAYS_INLINE inline size_t getHash(const Data & data, size_t row, Arena & pool, std::vector & sort_key_containers) { auto key_holder = static_cast(*this).getKeyHolder(row, &pool, sort_key_containers); return data.hash(keyHolderGetKey(key_holder)); @@ -167,7 +167,7 @@ class HashMethodBase } template - ALWAYS_INLINE EmplaceResult emplaceImpl(KeyHolder & key_holder, Data & data) + ALWAYS_INLINE inline EmplaceResult emplaceImpl(KeyHolder & key_holder, Data & data) { if constexpr (Cache::consecutive_keys_optimization) { @@ -220,7 +220,7 @@ class HashMethodBase } template - ALWAYS_INLINE FindResult findKeyImpl(Key key, Data & data) + ALWAYS_INLINE inline FindResult findKeyImpl(Key key, Data & data) { if constexpr (Cache::consecutive_keys_optimization) { diff --git a/dbms/src/Functions/CMakeLists.txt b/dbms/src/Functions/CMakeLists.txt index eb45bd772d6..39d3e687721 100644 --- a/dbms/src/Functions/CMakeLists.txt +++ b/dbms/src/Functions/CMakeLists.txt @@ -20,7 +20,7 @@ add_headers_and_sources(clickhouse_functions ./Conditional) add_headers_and_sources(clickhouse_functions ${TiFlash_BINARY_DIR}/dbms/src/Functions) if (TIFLASH_ENABLE_AVX_SUPPORT) - set_source_files_properties(CollationStringOptimized.cpp APPEND COMPILE_FLAGS "-mavx -mavx2") + set_source_files_properties(CollationStringOptimized.cpp APPEND COMPILE_FLAGS "-mavx -mavx2 ${COMPILER_MOVBE_FLAG}") endif () list(REMOVE_ITEM clickhouse_functions_sources IFunction.cpp FunctionFactory.cpp FunctionHelpers.cpp) diff --git a/dbms/src/Functions/FunctionsComparison.h b/dbms/src/Functions/FunctionsComparison.h index 84364e74369..fda8a113ada 100644 --- a/dbms/src/Functions/FunctionsComparison.h +++ b/dbms/src/Functions/FunctionsComparison.h @@ -364,7 +364,7 @@ struct StringComparisonWithCollatorImpl size_t a_n = a.size(); size_t b_n = b.size(); - int res = collator->compare(reinterpret_cast(a.data()), a_n, reinterpret_cast(b.data()), b_n); + int res = collator->compareFastPath(reinterpret_cast(a.data()), a_n, reinterpret_cast(b.data()), b_n); c = Op::apply(res, 0); } }; diff --git a/dbms/src/Functions/LeastGreatest.h b/dbms/src/Functions/LeastGreatest.h index 9ca126a2ada..55e76b66524 100644 --- a/dbms/src/Functions/LeastGreatest.h +++ b/dbms/src/Functions/LeastGreatest.h @@ -13,7 +13,7 @@ // limitations under the License. 
#pragma once -#include + #include #include #include diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 6cb947a1bfa..f5311a6ab0f 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -303,6 +304,18 @@ AggregatedDataVariants::Type Aggregator::chooseAggregationMethod() if (params.keys_size == 1 && types_removed_nullable[0]->isString()) return AggregatedDataVariants::Type::key_string; + if (params.keys_size > 1 && types_removed_nullable[0]->isString()) + { + bool is_all_str = std::all_of(types_removed_nullable.data(), types_removed_nullable.data() + params.keys_size, [](const auto & x) { + return x->isString(); + }); + + if (is_all_str) + { + return AggregatedDataVariants::Type::multi_key_string; + } + } + if (params.keys_size == 1 && types_removed_nullable[0]->isFixedString()) return AggregatedDataVariants::Type::key_fixed_string; @@ -361,7 +374,7 @@ void NO_INLINE Aggregator::executeImpl( } template -void NO_INLINE Aggregator::executeImplBatch( +ALWAYS_INLINE void Aggregator::executeImplBatch( Method & method, typename Method::State & state, Arena * aggregates_pool, diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index 052e3dbdcbb..688298cb14d 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -188,7 +188,7 @@ struct AggregationMethodStringNoCache AggregationMethodStringNoCache() = default; template - AggregationMethodStringNoCache(const Other & other) + explicit AggregationMethodStringNoCache(const Other & other) : data(other.data) {} @@ -202,6 +202,35 @@ struct AggregationMethodStringNoCache } }; +/// Same as above but without cache +template +struct AggregationMethodMultiStringNoCache +{ + using Data = TData; + using Key = typename Data::key_type; + using Mapped = typename Data::mapped_type; + + Data data; + + AggregationMethodMultiStringNoCache() = default; + + template + explicit AggregationMethodMultiStringNoCache(const Other & other) + : data(other.data) + {} + + using State = ColumnsHashing::HashMethodMultiString; + + std::optional shuffleKeyColumns(std::vector &, const Sizes &) { return {}; } + + static void insertKeyIntoColumns(const StringRef & key, std::vector & key_columns, const Sizes &, const TiDB::TiDBCollators &) + { + const auto * pos = key.data; + for (auto & key_column : key_columns) + pos = static_cast(key_column)->deserializeAndInsertFromArena(pos, nullptr); + } +}; + /// For the case where there is one fixed-length string key. 
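For the new `multi_key_string` case chosen above, each row's grouping key is materialized in the arena by `HashMethodMultiString::genSerializedKeyHolder` (earlier in this patch) as a length-prefixed concatenation of the per-column keys, after collation handling. A standalone sketch of that encoding, with illustrative names only:

    #include <cstring>
    #include <initializer_list>
    #include <string>
    #include <string_view>

    // Per row: [size_t len0][bytes0][size_t len1][bytes1]... for every string key column.
    std::string serialize_multi_string_key(std::initializer_list<std::string_view> keys)
    {
        std::string out;
        for (const auto key : keys)
        {
            const size_t len = key.size();
            out.append(reinterpret_cast<const char *>(&len), sizeof(len)); // length prefix
            out.append(key.data(), key.size());                            // raw bytes / sort key
        }
        return out;
    }

`insertKeyIntoColumns` above walks the same buffer to rebuild the individual key columns.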
template struct AggregationMethodFixedString @@ -418,6 +447,7 @@ struct AggregatedDataVariants : private boost::noncopyable std::unique_ptr> key64; std::unique_ptr> key_int256; std::unique_ptr> key_string; + std::unique_ptr> multi_key_string; std::unique_ptr> key_fixed_string; std::unique_ptr> keys16; std::unique_ptr> keys32; @@ -430,6 +460,7 @@ struct AggregatedDataVariants : private boost::noncopyable std::unique_ptr> key64_two_level; std::unique_ptr> key_int256_two_level; std::unique_ptr> key_string_two_level; + std::unique_ptr> multi_key_string_two_level; std::unique_ptr> key_fixed_string_two_level; std::unique_ptr> keys32_two_level; std::unique_ptr> keys64_two_level; @@ -457,6 +488,7 @@ struct AggregatedDataVariants : private boost::noncopyable M(key32, false) \ M(key64, false) \ M(key_string, false) \ + M(multi_key_string, false) \ M(key_fixed_string, false) \ M(keys16, false) \ M(keys32, false) \ @@ -469,6 +501,7 @@ struct AggregatedDataVariants : private boost::noncopyable M(key64_two_level, true) \ M(key_int256_two_level, true) \ M(key_string_two_level, true) \ + M(multi_key_string_two_level, true) \ M(key_fixed_string_two_level, true) \ M(keys32_two_level, true) \ M(keys64_two_level, true) \ @@ -628,6 +661,7 @@ struct AggregatedDataVariants : private boost::noncopyable M(keys128) \ M(keys256) \ M(serialized) \ + M(multi_key_string) \ M(nullable_keys128) \ M(nullable_keys256) @@ -675,6 +709,7 @@ struct AggregatedDataVariants : private boost::noncopyable M(keys128_two_level) \ M(keys256_two_level) \ M(serialized_two_level) \ + M(multi_key_string_two_level) \ M(nullable_keys128_two_level) \ M(nullable_keys256_two_level) }; diff --git a/dbms/src/Storages/Transaction/Collator.h b/dbms/src/Storages/Transaction/Collator.h index 3e7410023bb..7236e21dd62 100644 --- a/dbms/src/Storages/Transaction/Collator.h +++ b/dbms/src/Storages/Transaction/Collator.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include @@ -83,6 +84,16 @@ class ITiDBCollator : public ICollator ~ITiDBCollator() override = default; int compare(const char * s1, size_t length1, const char * s2, size_t length2) const override = 0; + + ALWAYS_INLINE inline int compareFastPath(const char * s1, size_t length1, const char * s2, size_t length2) const + { + if (likely(isPaddingBinary())) + { + return DB::BinCollatorCompare(s1, length1, s2, length2); + } + return compare(s1, length1, s2, length2); + } + virtual StringRef sortKey(const char * s, size_t length, std::string & container) const = 0; virtual std::unique_ptr pattern() const = 0; int32_t getCollatorId() const { return collator_id; } @@ -90,6 +101,38 @@ class ITiDBCollator : public ICollator bool isBinary() const; bool isCI() const; + ALWAYS_INLINE static inline bool isPaddingBinary(CollatorType collator_type) + { + switch (collator_type) + { + case CollatorType::UTF8MB4_BIN: + case CollatorType::UTF8_BIN: + case CollatorType::LATIN1_BIN: + case CollatorType::ASCII_BIN: + { + // collator_type < 4 + return true; + } + default: + break; + } + return false; + } + + ALWAYS_INLINE inline bool isPaddingBinary() const + { + return isPaddingBinary(getCollatorType()); + } + + ALWAYS_INLINE inline StringRef sortKeyFastPath(const char * s, size_t length, std::string & container) const + { + if (likely(isPaddingBinary())) + { + return DB::BinCollatorSortKey(s, length); + } + return sortKey(s, length, container); + } + protected: explicit ITiDBCollator(int32_t collator_id_); int32_t collator_id; // collator id to be compatible with TiDB diff --git 
a/dbms/src/Storages/Transaction/CollatorCompare.h b/dbms/src/Storages/Transaction/CollatorCompare.h new file mode 100644 index 00000000000..2124d9ac839 --- /dev/null +++ b/dbms/src/Storages/Transaction/CollatorCompare.h @@ -0,0 +1,98 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include +#include + +namespace DB +{ + +template +ALWAYS_INLINE inline int signum(T val) +{ + return (0 < val) - (val < 0); +} + +// Check equality is much faster than other comparison. +// - check size first +// - return 0 if equal else 1 +FLATTEN_INLINE_PURE static inline int RawStrEqualCompare(const std::string_view & lhs, const std::string_view & rhs) +{ + return mem_utils::IsStrViewEqual(lhs, rhs) ? 0 : 1; +} + +// Compare str view by memcmp +FLATTEN_INLINE_PURE inline int RawStrCompare(const std::string_view & v1, const std::string_view & v2) +{ + return mem_utils::CompareStrView(v1, v2); +} + +constexpr char SPACE = ' '; + +FLATTEN_INLINE_PURE inline std::string_view RightTrimRaw(const std::string_view & v) +{ + size_t end = v.find_last_not_of(SPACE); + return end == std::string_view::npos ? std::string_view{} : std::string_view(v.data(), end + 1); +} + +// Remove tail space +FLATTEN_INLINE_PURE inline std::string_view RightTrim(const std::string_view & v) +{ + if (likely(v.empty() || v.back() != SPACE)) + return v; + return RightTrimRaw(v); +} + +FLATTEN_INLINE_PURE inline std::string_view RightTrimNoEmpty(const std::string_view & v) +{ + if (likely(v.back() != SPACE)) + return v; + return RightTrimRaw(v); +} + +FLATTEN_INLINE_PURE inline int RtrimStrCompare(const std::string_view & va, const std::string_view & vb) +{ + return RawStrCompare(RightTrim(va), RightTrim(vb)); +} + +template +FLATTEN_INLINE_PURE inline int BinCollatorCompare(const char * s1, size_t length1, const char * s2, size_t length2) +{ + if constexpr (padding) + return DB::RtrimStrCompare({s1, length1}, {s2, length2}); + else + return DB::RawStrCompare({s1, length1}, {s2, length2}); +} + +template +FLATTEN_INLINE_PURE inline StringRef BinCollatorSortKey(const char * s, size_t length) +{ + if constexpr (padding) + { + return StringRef(RightTrim({s, length})); + } + else + { + return StringRef(s, length); + } +} + + +} // namespace DB diff --git a/dbms/src/Storages/Transaction/CollatorUtils.h b/dbms/src/Storages/Transaction/CollatorUtils.h index 3f53791276c..328feac048f 100644 --- a/dbms/src/Storages/Transaction/CollatorUtils.h +++ b/dbms/src/Storages/Transaction/CollatorUtils.h @@ -14,86 +14,11 @@ #pragma once -#include -#include -#include - -#include -#include +#include namespace DB { -template -ALWAYS_INLINE inline int signum(T val) -{ - return (0 < val) - (val < 0); -} - -// Check equality is much faster than other comparison. 
-// - check size first -// - return 0 if equal else 1 -FLATTEN_INLINE_PURE static inline int RawStrEqualCompare(const std::string_view & lhs, const std::string_view & rhs) -{ - return mem_utils::IsStrViewEqual(lhs, rhs) ? 0 : 1; -} - -// Compare str view by memcmp -FLATTEN_INLINE_PURE inline int RawStrCompare(const std::string_view & v1, const std::string_view & v2) -{ - return v1.compare(v2); -} - -constexpr char SPACE = ' '; - -FLATTEN_INLINE_PURE inline std::string_view RightTrimRaw(const std::string_view & v) -{ - size_t end = v.find_last_not_of(SPACE); - return end == std::string_view::npos ? std::string_view{} : std::string_view(v.data(), end + 1); -} - -// Remove tail space -FLATTEN_INLINE_PURE inline std::string_view RightTrim(const std::string_view & v) -{ - if (likely(v.empty() || v.back() != SPACE)) - return v; - return RightTrimRaw(v); -} - -FLATTEN_INLINE_PURE inline std::string_view RightTrimNoEmpty(const std::string_view & v) -{ - if (likely(v.back() != SPACE)) - return v; - return RightTrimRaw(v); -} - -FLATTEN_INLINE_PURE inline int RtrimStrCompare(const std::string_view & va, const std::string_view & vb) -{ - return RawStrCompare(RightTrim(va), RightTrim(vb)); -} - -template -FLATTEN_INLINE_PURE inline int BinCollatorCompare(const char * s1, size_t length1, const char * s2, size_t length2) -{ - if constexpr (padding) - return DB::RtrimStrCompare({s1, length1}, {s2, length2}); - else - return DB::RawStrCompare({s1, length1}, {s2, length2}); -} - -template -FLATTEN_INLINE_PURE inline StringRef BinCollatorSortKey(const char * s, size_t length) -{ - if constexpr (padding) - { - return StringRef(RightTrim({s, length})); - } - else - { - return StringRef(s, length); - } -} - // Loop columns and invoke callback for each pair. // Remove last zero byte. 
template diff --git a/dbms/src/Storages/Transaction/TiDB.cpp b/dbms/src/Storages/Transaction/TiDB.cpp index ed107e99aba..5beda969e01 100644 --- a/dbms/src/Storages/Transaction/TiDB.cpp +++ b/dbms/src/Storages/Transaction/TiDB.cpp @@ -246,7 +246,7 @@ Int64 ColumnInfo::getEnumIndex(const String & enum_id_or_text) const collator = ITiDBCollator::getCollator("binary"); for (const auto & elem : elems) { - if (collator->compare(elem.first.data(), elem.first.size(), enum_id_or_text.data(), enum_id_or_text.size()) == 0) + if (collator->compareFastPath(elem.first.data(), elem.first.size(), enum_id_or_text.data(), enum_id_or_text.size()) == 0) { return elem.second; } @@ -265,12 +265,12 @@ UInt64 ColumnInfo::getSetValue(const String & set_str) const Poco::StringTokenizer string_tokens(set_str, ","); std::set marked; for (const auto & s : string_tokens) - marked.insert(collator->sortKey(s.data(), s.length(), sort_key_container).toString()); + marked.insert(collator->sortKeyFastPath(s.data(), s.length(), sort_key_container).toString()); UInt64 value = 0; for (size_t i = 0; i < elems.size(); i++) { - String key = collator->sortKey(elems.at(i).first.data(), elems.at(i).first.length(), sort_key_container).toString(); + String key = collator->sortKeyFastPath(elems.at(i).first.data(), elems.at(i).first.length(), sort_key_container).toString(); auto it = marked.find(key); if (it != marked.end()) { diff --git a/libs/CMakeLists.txt b/libs/CMakeLists.txt index 8138279e275..13ca92f3a52 100644 --- a/libs/CMakeLists.txt +++ b/libs/CMakeLists.txt @@ -21,9 +21,7 @@ add_subdirectory (libcommon) add_subdirectory (libpocoext) add_subdirectory (libdaemon) -if (USE_INTERNAL_MEMCPY) - add_subdirectory (libmemcpy) -endif() +add_subdirectory (libmemcpy) if (GLIBC_COMPATIBILITY) add_subdirectory (libglibc-compatibility) diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index 9cb8ec8c146..e5283c086a9 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -160,7 +160,7 @@ endif () if (TIFLASH_ENABLE_AVX_SUPPORT) # https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html set_source_files_properties(src/mem_utils_avx2.cpp APPEND COMPILE_FLAGS "-mavx -mavx2") - set_source_files_properties(src/avx2_mem_utils_impl.cpp APPEND COMPILE_FLAGS "-mavx -mavx2") + set_source_files_properties(src/avx2_mem_utils_impl.cpp APPEND COMPILE_FLAGS "-mavx -mavx2 ${COMPILER_MOVBE_FLAG} -fomit-frame-pointer") if (TIFLASH_COMPILER_VPCLMULQDQ_SUPPORT) set_source_files_properties(src/crc64_avx2.cpp APPEND COMPILE_FLAGS "-mavx2 -mpclmul -mvpclmulqdq -Wno-ignored-attributes") diff --git a/libs/libcommon/include/common/StringRef.h b/libs/libcommon/include/common/StringRef.h index f43c18370d3..d03bd643786 100644 --- a/libs/libcommon/include/common/StringRef.h +++ b/libs/libcommon/include/common/StringRef.h @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -72,6 +73,11 @@ struct StringRef explicit operator std::string() const { return toString(); } constexpr explicit operator std::string_view() const { return {data, size}; } + + ALWAYS_INLINE inline int compare(const StringRef & tar) const + { + return mem_utils::CompareStrView({*this}, {tar}); + } }; /// Here constexpr doesn't implicate inline, see https://www.viva64.com/en/w/v1043/ @@ -84,16 +90,10 @@ using StringRefs = std::vector; // According to https://github.com/pingcap/tiflash/pull/5658 // - if size of memory area is bigger than 1M, instructions about avx512 may begin to get better results -// - otherwise, use `std::string_view == 
std::string_view` or `mem_utils::avx2_mem_equal`(under x86-64 with avx2) +// - otherwise, use `mem_utils::avx2_mem_equal`(under x86-64 with avx2) inline bool operator==(StringRef lhs, StringRef rhs) { - if (lhs.size != rhs.size) - return false; - - if (lhs.size == 0) - return true; - - return mem_utils::memoryEqual(lhs.data, rhs.data, lhs.size); + return mem_utils::IsStrViewEqual({lhs}, {rhs}); } inline bool operator!=(StringRef lhs, StringRef rhs) @@ -103,14 +103,12 @@ inline bool operator!=(StringRef lhs, StringRef rhs) inline bool operator<(StringRef lhs, StringRef rhs) { - int cmp = memcmp(lhs.data, rhs.data, std::min(lhs.size, rhs.size)); - return cmp < 0 || (cmp == 0 && lhs.size < rhs.size); + return lhs.compare(rhs) < 0; } inline bool operator>(StringRef lhs, StringRef rhs) { - int cmp = memcmp(lhs.data, rhs.data, std::min(lhs.size, rhs.size)); - return cmp > 0 || (cmp == 0 && lhs.size > rhs.size); + return lhs.compare(rhs) > 0; } diff --git a/libs/libcommon/include/common/avx2_mem_utils.h b/libs/libcommon/include/common/avx2_mem_utils.h index 822fb4b7a40..03e078f2f0a 100644 --- a/libs/libcommon/include/common/avx2_mem_utils.h +++ b/libs/libcommon/include/common/avx2_mem_utils.h @@ -21,6 +21,7 @@ #include #include #include +#include #include namespace mem_utils::details @@ -40,7 +41,7 @@ ALWAYS_INLINE static inline T clear_rightmost_bit_one(const T value) ALWAYS_INLINE static inline uint32_t rightmost_bit_one_index(const uint32_t value) { assert(value != 0); - return _tzcnt_u32(value); + return __builtin_ctz(value); } using Block32 = __m256i; @@ -91,20 +92,6 @@ FLATTEN_INLINE_PURE static inline int cmp_block1(const void * p1, const void * p return int32_t(read(p1)) - int32_t(read(p2)); } -FLATTEN_INLINE_PURE static inline int cmp_block8(const void * p1, const void * p2) -{ - // the left most bit may be 1, use std::memcmp(,,8) to use `sbb` - /* - bswap rcx - bswap rdx - xor eax, eax - cmp rcx, rdx - seta al - sbb eax, 0 - */ - return std::memcmp(p1, p2, 8); -} - FLATTEN_INLINE_PURE static inline int cmp_block16(const char * p1, const char * p2) { uint32_t mask = get_block16_cmp_eq_mask(p1, p2); // mask is up to 0xffff @@ -112,18 +99,37 @@ FLATTEN_INLINE_PURE static inline int cmp_block16(const char * p1, const char * if (unlikely(mask != 0)) { auto pos = rightmost_bit_one_index(mask); - return cmp_block1(p1 + pos, p2 + pos); + int ret = cmp_block1(p1 + pos, p2 + pos); + if (ret == 0) + { + __builtin_unreachable(); + } + else + { + return ret; + } } return 0; } -FLATTEN_INLINE_PURE static inline int cmp_block32(const char * p1, const char * p2) + +template +FLATTEN_INLINE_PURE static inline int cmp_block32(const char * p1, + const char * p2) { uint32_t mask = get_block32_cmp_eq_mask(p1, p2); // mask is up to 0xffffffff mask -= Block32Mask; - if (unlikely(mask != 0)) + if (must_not_eq || unlikely(mask != 0)) { auto pos = rightmost_bit_one_index(mask); - return cmp_block1(p1 + pos, p2 + pos); + int ret = cmp_block1(p1 + pos, p2 + pos); + if (ret == 0) + { + __builtin_unreachable(); + } + else + { + return ret; + } } return 0; } @@ -150,42 +156,66 @@ FLATTEN_INLINE_PURE static inline bool check_block32x4_eq(const char * a, const FLATTEN_INLINE_PURE static inline int cmp_block32x4(const char * a, const char * b) { - if (check_block32x4_eq(a, b)) + if (likely(check_block32x4_eq(a, b))) return 0; - for (size_t i = 0; i < AVX2_UNROLL_NUM - 1; ++i) + for (size_t i = 0; i < (AVX2_UNROLL_NUM - 1); ++i) { - if (auto ret = cmp_block32(a + i * BLOCK32_SIZE, (b + i * BLOCK32_SIZE)); ret) + if (auto 
ret = cmp_block32(a + i * BLOCK32_SIZE, (b + i * BLOCK32_SIZE)); unlikely(ret)) return ret; } - return cmp_block32(a + (AVX2_UNROLL_NUM - 1) * BLOCK32_SIZE, (b + (AVX2_UNROLL_NUM - 1) * BLOCK32_SIZE)); -} -FLATTEN_INLINE_PURE static inline uint32_t swap_u32(uint32_t val) -{ - return __builtin_bswap32(val); -} -FLATTEN_INLINE_PURE static inline uint64_t swap_u64(uint64_t val) -{ - return __builtin_bswap64(val); -} - -[[maybe_unused]] FLATTEN_INLINE_PURE static inline uint32_t read_u32_swap(const void * data) -{ - return swap_u32(read(data)); -} - -[[maybe_unused]] FLATTEN_INLINE_PURE static inline uint64_t read_u64_swap(const void * data) -{ - return swap_u64(read(data)); + return cmp_block32(a + (AVX2_UNROLL_NUM - 1) * BLOCK32_SIZE, (b + (AVX2_UNROLL_NUM - 1) * BLOCK32_SIZE)); } // ref: https://github.com/lattera/glibc/blob/master/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S -FLATTEN_INLINE_PURE static inline int avx2_mem_cmp(const char * p1, const char * p2, size_t n) +FLATTEN_INLINE_PURE static inline int avx2_mem_cmp(const char * p1, const char * p2, size_t n) noexcept { constexpr size_t loop_block32x4_size = AVX2_UNROLL_NUM * BLOCK32_SIZE; // n <= 32 if (likely(n <= BLOCK32_SIZE)) { +#if !defined(AVX2_MEM_CMP_NORMAL_IF_ELSE) + +#ifdef M + static_assert(false, "`M` is defined"); +#else +#define M(x) \ + case (x): \ + { \ + return __builtin_memcmp(p1, p2, (x)); \ + } +#endif + switch (n) + { + M(0); + M(1); + M(2); + M(3); + M(4); + M(5); + M(6); + M(7); + M(8); + M(9); + M(10); + M(11); + M(12); + M(13); + M(14); + M(15); + M(16); + default: + { + // 17~32 + if (auto ret = cmp_block16(p1, p2); ret) + return ret; + return cmp_block16(p1 + n - BLOCK16_SIZE, p2 + n - BLOCK16_SIZE); + } + } +#undef M + +#else + // an optional way to check small str if (unlikely(n < 2)) { // 0~1 @@ -245,6 +275,7 @@ FLATTEN_INLINE_PURE static inline int avx2_mem_cmp(const char * p1, const char * return ret; return cmp_block16(p1 + n - BLOCK16_SIZE, p2 + n - BLOCK16_SIZE); } +#endif } // 8 * 32 < n if (unlikely(8 * BLOCK32_SIZE < n)) @@ -255,10 +286,10 @@ FLATTEN_INLINE_PURE static inline int avx2_mem_cmp(const char * p1, const char * return ret; { // align addr of one data pointer - auto offset = BLOCK32_SIZE - OFFSET_FROM_ALIGNED(size_t(p2), BLOCK32_SIZE); - p1 += offset; - p2 += offset; - n -= offset; + auto offset = ssize_t(OFFSET_FROM_ALIGNED(size_t(p2), BLOCK32_SIZE)) - BLOCK32_SIZE; + p1 -= offset; + p2 -= offset; + n += offset; } for (; n >= loop_block32x4_size;) @@ -312,13 +343,15 @@ FLATTEN_INLINE_PURE static inline int avx2_mem_cmp(const char * p1, const char * } } -FLATTEN_INLINE_PURE static inline bool avx2_mem_equal(const char * p1, const char * p2, size_t n) +FLATTEN_INLINE_PURE static inline bool avx2_mem_equal(const char * p1, const char * p2, size_t n) noexcept { constexpr size_t loop_block32x4_size = AVX2_UNROLL_NUM * BLOCK32_SIZE; // n <= 32 if (likely(n <= BLOCK32_SIZE)) { +#if !defined(AVX2_MEM_EQ_NORMAL_IF_ELSE) + #ifdef M static_assert(false, "`M` is defined"); #else @@ -359,8 +392,8 @@ FLATTEN_INLINE_PURE static inline bool avx2_mem_equal(const char * p1, const cha } #undef M -// an optional way to check small str -#if defined(AVX2_MEM_EQ_NORMAL_IF_ELSE) +#else + // an optional way to check small str if (unlikely(n < 2)) { // 0~1 @@ -415,10 +448,10 @@ FLATTEN_INLINE_PURE static inline bool avx2_mem_equal(const char * p1, const cha return false; { // align addr of one data pointer - auto offset = BLOCK32_SIZE - OFFSET_FROM_ALIGNED(size_t(p2), BLOCK32_SIZE); - p1 += offset; - p2 += 
offset; - n -= offset; + auto offset = ssize_t(OFFSET_FROM_ALIGNED(size_t(p2), BLOCK32_SIZE)) - BLOCK32_SIZE; + p1 -= offset; + p2 -= offset; + n += offset; } for (; n >= loop_block32x4_size;) diff --git a/libs/libcommon/include/common/mem_utils_opt.h b/libs/libcommon/include/common/mem_utils_opt.h index 66057ccdd51..e691a1563aa 100644 --- a/libs/libcommon/include/common/mem_utils_opt.h +++ b/libs/libcommon/include/common/mem_utils_opt.h @@ -93,8 +93,7 @@ FLATTEN_INLINE_PURE static inline int CompareStrView(const std::string_view & lh if (ret == 0) { - auto a = lhs.size(), b = rhs.size(); - ret = (a == b) ? 0 : (a < b ? -1 : 1); + ret = (lhs.size() == rhs.size()) ? 0 : (lhs.size() < rhs.size() ? -1 : 1); } return ret; #else diff --git a/libs/libcommon/include/common/memcpy.h b/libs/libcommon/include/common/memcpy.h new file mode 100644 index 00000000000..fbdbc0f1527 --- /dev/null +++ b/libs/libcommon/include/common/memcpy.h @@ -0,0 +1,32 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include + +#if defined(__SSE2__) +#include +#endif + +ALWAYS_INLINE static inline void * inline_memcpy(void * __restrict dst, const void * __restrict src, size_t size) +{ +#if defined(__SSE2__) + return sse2_inline_memcpy(dst, src, size); +#else + return std::memcpy(dst, src, size); +#endif +} diff --git a/libs/libcommon/include/common/sse2_memcpy.h b/libs/libcommon/include/common/sse2_memcpy.h new file mode 100644 index 00000000000..2919bb0b866 --- /dev/null +++ b/libs/libcommon/include/common/sse2_memcpy.h @@ -0,0 +1,139 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include +#include + +// Custom inline memcpy implementation for TiFlash. +// - it is recommended for use in inline functions when `sse2` is supported +// - it performs better than `legacy::inline_memcpy` (from clickhouse) according to `libs/libcommon/src/tests/bench_memcpy.cpp` +// - like `std::memcpy`, the behavior is undefined when the source and the destination objects overlap +// - moving data from register to memory costs more than the reverse, so it pays to reduce the number of copy operations.
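The small-size branches in the implementation below rely on one trick: for a size class (N, 2N], copying the first N bytes and the last N bytes covers the whole range, even though the two copies may overlap. A minimal standalone sketch for the 5..8-byte class (illustrative name):

    #include <cstring>

    // Copy `size` bytes where 5 <= size <= 8: the head and tail 4-byte copies
    // overlap in the middle, so no per-byte loop or branch on the exact size is needed.
    inline void copy_5_to_8(char * dst, const char * src, size_t size)
    {
        std::memcpy(dst, src, 4);                       // head
        std::memcpy(dst + size - 4, src + size - 4, 4); // tail, may overlap the head copy
    }

The MCP/MCP_END macro pair below applies the same pattern for 2-, 4-, 8- and 16-byte chunks.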
+ALWAYS_INLINE static inline void * sse2_inline_memcpy(void * __restrict dst_, const void * __restrict src_, size_t size) +{ + char * __restrict dst = reinterpret_cast(dst_); + const char * __restrict src = reinterpret_cast(src_); + + void * ret = dst; + +#if defined(MCP) || defined(MCP_END) + static_assert(false); +#endif + +#define MCP_END(n) tiflash_compiler_builtin_memcpy(dst + size - (n), src + size - (n), (n)); +#define MCP(n) tiflash_compiler_builtin_memcpy(dst, src, (n)); + + if (likely(size <= 32)) + { + if (unlikely(size <= 1)) + { + if (likely(size == 1)) + { + /// A single byte. + *dst = *src; + } + /// No bytes remaining. + } + else if (unlikely(size <= 4)) + { + /// Chunks of 2..4 bytes. + MCP(2); + MCP_END(2); + } + else if (unlikely(size <= 8)) + { + /// Chunks of 5..8 bytes. + MCP(4); + MCP_END(4); + } + else if (unlikely(size <= 16)) + { + /// Chunks of 9..16 bytes. + MCP(8); + MCP_END(8); + } + else + { + /// Chunks of 17..32 bytes. + MCP(16); + MCP_END(16); + } + } + else + { + if (unlikely(size > 128)) + { + /// Large size with fully unrolled loop. + { + MCP(16); + + // reduce instruction: `offset` = or (`dst`, 0xfffffffffffffff0) + auto offset = ssize_t(size_t(dst) % 16) - 16; + dst -= offset; + src -= offset; + size += offset; + } + + /// Aligned unrolled copy. + __m128i c0, c1, c2, c3, c4, c5, c6, c7; + + while (size >= 128) + { + c0 = _mm_loadu_si128(reinterpret_cast(src) + 0); + c1 = _mm_loadu_si128(reinterpret_cast(src) + 1); + c2 = _mm_loadu_si128(reinterpret_cast(src) + 2); + c3 = _mm_loadu_si128(reinterpret_cast(src) + 3); + c4 = _mm_loadu_si128(reinterpret_cast(src) + 4); + c5 = _mm_loadu_si128(reinterpret_cast(src) + 5); + c6 = _mm_loadu_si128(reinterpret_cast(src) + 6); + c7 = _mm_loadu_si128(reinterpret_cast(src) + 7); + src += 128; + _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 0), c0); + _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 1), c1); + _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 2), c2); + _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 3), c3); + _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 4), c4); + _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 5), c5); + _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 6), c6); + _mm_store_si128((reinterpret_cast<__m128i *>(dst) + 7), c7); + dst += 128; + + size -= 128; + } + } + + // size <= 128 + + while (size > 16) + { + MCP(16); + + dst += 16; + src += 16; + size -= 16; + } + + // size <= 16 + MCP_END(16); + } + return ret; + +#undef MCP +#undef MCP_END +} diff --git a/libs/libcommon/src/tests/CMakeLists.txt b/libs/libcommon/src/tests/CMakeLists.txt index 3383ee50da2..43d33bff010 100644 --- a/libs/libcommon/src/tests/CMakeLists.txt +++ b/libs/libcommon/src/tests/CMakeLists.txt @@ -40,15 +40,21 @@ add_executable (gtests_libcommon gtest_logger.cpp gtest_arithmetic_overflow.cpp ) -target_link_libraries (gtests_libcommon gtest_main common) +target_link_libraries (gtests_libcommon gtest_main common memcpy) add_check(gtests_libcommon) +set (bench_libcommon_sources bench_mem_utils.cpp) + +if (NOT USE_INTERNAL_MEMCPY) + list (APPEND bench_libcommon_sources bench_memcpy.cpp) +endif () + add_executable(bench_libcommon # TODO: need to fix broken src file if necessary # bench_logger.cpp - bench_mem_utils.cpp + ${bench_libcommon_sources} ) -target_link_libraries(bench_libcommon benchmark::benchmark_main common m) +target_link_libraries(bench_libcommon benchmark::benchmark_main common m memcpy) add_executable (dump_variable dump_variable.cpp) target_link_libraries (dump_variable 
clickhouse_common_io) diff --git a/libs/libcommon/src/tests/bench_mem_utils.cpp b/libs/libcommon/src/tests/bench_mem_utils.cpp index 7531202ad7f..8ae2c58de30 100644 --- a/libs/libcommon/src/tests/bench_mem_utils.cpp +++ b/libs/libcommon/src/tests/bench_mem_utils.cpp @@ -35,6 +35,7 @@ constexpr size_t RESERVE_OFFSET = 200; constexpr size_t TEST_ALIGN_SIZE = 64; static_assert(RESERVE_OFFSET > TEST_ALIGN_SIZE * 2); constexpr char DEFAULT_INIT_CHAR = '0'; +constexpr char DEFAULT_TEST_CHAR = '1'; static constexpr size_t TEST_ALIGN_OFF_1 = 15; static constexpr size_t TEST_ALIGN_OFF_2 = 31; @@ -74,6 +75,36 @@ class MemUtilsEqual : public benchmark::Fixture } }; +template +class MemUtilsCmp : public benchmark::Fixture +{ +protected: + std::string inner_data1; + std::string inner_data2; + std::string_view data1; + std::string_view data2; + +public: + static constexpr size_t max_size = max_src_size; + + void SetUp(const ::benchmark::State & /*state*/) override + { + inner_data1.resize(max_size + RESERVE_OFFSET, DEFAULT_INIT_CHAR); + inner_data2 = inner_data1; + + { + const auto * src = reinterpret_cast((size_t(inner_data1.data()) + TEST_ALIGN_SIZE - 1) / TEST_ALIGN_SIZE * TEST_ALIGN_SIZE + TEST_ALIGN_OFF_1); // start address not aligned + data1 = {src, max_size}; + } + + { + auto * src = reinterpret_cast((size_t(inner_data2.data()) + TEST_ALIGN_SIZE - 1) / TEST_ALIGN_SIZE * TEST_ALIGN_SIZE + TEST_ALIGN_OFF_2); // start address not aligned + src[max_size - 1] = DEFAULT_TEST_CHAR; + data2 = {src, max_size}; + } + } +}; + template class MemUtilsStrStr : public benchmark::Fixture { @@ -124,6 +155,10 @@ ALWAYS_INLINE static inline bool stl_mem_eq(const char * p1, const char * p2, si { return std::memcmp(p1, p2, n) == 0; // call bcmp@plt } +ALWAYS_INLINE static inline int stl_mem_cmp(const char * p1, const char * p2, size_t n) +{ + return std::memcmp(p1, p2, n); // call memcmp@plt +} NO_INLINE size_t stl_str_find(std::string_view s, std::string_view p) { @@ -132,24 +167,52 @@ NO_INLINE size_t stl_str_find(std::string_view s, std::string_view p) // volatile value is used to prevent compiler optimization for fixed context -#define BENCH_MEM_EQ(name1, name2, func, iter_cnt) \ - BENCHMARK_DEFINE_F(name1, name2) \ - (benchmark::State & state) \ - { \ - [[maybe_unused]] volatile size_t _volatile_flags = 1; \ - [[maybe_unused]] volatile size_t cnt = max_size; \ - for (auto _ : state) \ - { \ - _volatile_flags = func(data1.data(), data2.data(), cnt); \ - if constexpr (varify_res) \ - { \ - if (unlikely(!_volatile_flags)) \ - exit(-1); \ - } \ - } \ - } \ +#define BENCH_MEM_EQ(name1, name2, func, loop_cnt, iter_cnt) \ + BENCHMARK_DEFINE_F(name1, name2) \ + (benchmark::State & state) \ + { \ + [[maybe_unused]] volatile size_t _volatile_flags = 1; \ + [[maybe_unused]] volatile size_t cnt = max_size; \ + for (auto _ : state) \ + { \ + for (size_t i = 0; i < (loop_cnt); ++i) \ + { \ + _volatile_flags = func(data1.data(), data2.data(), cnt > i ? cnt - i : cnt); \ + if constexpr (varify_res) \ + { \ + if (unlikely(!_volatile_flags)) \ + exit(-1); \ + } \ + } \ + } \ + } \ BENCHMARK_REGISTER_F(name1, name2)->Iterations(iter_cnt); +#define BENCH_MEM_CMP(name1, name2, func, loop_cnt, iter_cnt) \ + BENCHMARK_DEFINE_F(name1, name2) \ + (benchmark::State & state) \ + { \ + [[maybe_unused]] volatile int _volatile_flags = 1; \ + [[maybe_unused]] volatile size_t cnt = max_size; \ + for (auto _ : state) \ + { \ + for (size_t i = 0; i < (loop_cnt); ++i) \ + { \ + size_t ori = cnt; \ + size_t n = ori > i ? 
ori - i : ori; \ + size_t diff = ori - n; \ + _volatile_flags = func(data1.data() + diff, data2.data() + diff, n); \ + if constexpr (varify_res) \ + { \ + if (unlikely(!(_volatile_flags < 0))) \ + { \ + exit(-1); \ + } \ + } \ + } \ + } \ + } \ + BENCHMARK_REGISTER_F(name1, name2)->Iterations(iter_cnt); #define BENCH_MEM_STRSTR(name1, name2, func, iter_cnt) \ BENCHMARK_DEFINE_F(name1, name2) \ @@ -174,23 +237,50 @@ NO_INLINE size_t stl_str_find(std::string_view s, std::string_view p) BENCHMARK_REGISTER_F(name1, name2)->Iterations(iter_cnt); -#define BENCH_MEM_EQ_ALL(max_src_size, iter_cnt) \ - using MemUtilsEqual##_##max_src_size = MemUtilsEqual; \ - BENCH_MEM_EQ(MemUtilsEqual##_##max_src_size, stl_mem_eq, stl_mem_eq, iter_cnt) \ - BENCH_MEM_EQ(MemUtilsEqual##_##max_src_size, mem_utils_memoryEqual_avx512, mem_utils::memoryEqual, iter_cnt) \ - BENCH_MEM_EQ(MemUtilsEqual##_##max_src_size, avx2_mem_equal, mem_utils::avx2_mem_equal, iter_cnt) +#define BENCH_MEM_EQ_ALL_IMPL(id, max_src_size, loop_cnt, iter_cnt) \ + using id = MemUtilsEqual; \ + BENCH_MEM_EQ(id, stl_mem_eq, stl_mem_eq, loop_cnt, iter_cnt) \ + BENCH_MEM_EQ(id, mem_utils_memoryEqual_avx512, mem_utils::memoryEqual, loop_cnt, iter_cnt) \ + BENCH_MEM_EQ(id, avx2_mem_equal, mem_utils::avx2_mem_equal, loop_cnt, iter_cnt) + +#define BENCH_MEM_EQ_IMPL_ID(max_src_size, loop_cnt, iter_cnt) MemUtilsEqual##_##max_src_size##_##loop_cnt + +#define BENCH_MEM_EQ_ALL(max_src_size, loop_cnt, iter_cnt) \ + BENCH_MEM_EQ_ALL_IMPL(BENCH_MEM_EQ_IMPL_ID(max_src_size, loop_cnt, iter_cnt), max_src_size, loop_cnt, iter_cnt) + +#define BENCH_MEM_CMP_ALL_IMPL(id, max_src_size, loop_cnt, iter_cnt) \ + using id = MemUtilsCmp; \ + BENCH_MEM_CMP(id, stl_mem_cmp, stl_mem_cmp, loop_cnt, iter_cnt) \ + BENCH_MEM_CMP(id, avx2_mem_cmp, mem_utils::avx2_mem_cmp, loop_cnt, iter_cnt) + +#define BENCH_MEM_CMP_IMPL_ID(max_src_size, loop_cnt, iter_cnt) MemUtilsCmp##_##max_src_size##_##loop_cnt + +#define BENCH_MEM_CMP_ALL(max_src_size, loop_cnt, iter_cnt) \ + BENCH_MEM_CMP_ALL_IMPL(BENCH_MEM_CMP_IMPL_ID(max_src_size, loop_cnt, iter_cnt), max_src_size, loop_cnt, iter_cnt) #define BENCH_MEM_STRSTR_ALL(max_cnt, max_src_size, max_needle_size, iter_cnt) \ using MemUtilsStrStr##_##max_src_size##_##max_needle_size = MemUtilsStrStr; \ BENCH_MEM_STRSTR(MemUtilsStrStr##_##max_src_size##_##max_needle_size, stl_str_find, stl_str_find, iter_cnt) \ BENCH_MEM_STRSTR(MemUtilsStrStr##_##max_src_size##_##max_needle_size, avx2_strstr, mem_utils::avx2_strstr, iter_cnt) -BENCH_MEM_EQ_ALL(13, 2000) -BENCH_MEM_EQ_ALL(65, 2000) -BENCH_MEM_EQ_ALL(100, 500) -BENCH_MEM_EQ_ALL(10000, 500) -BENCH_MEM_EQ_ALL(100000, 500) -BENCH_MEM_EQ_ALL(1000000, 200) +#define BENCH_MEM_EQ_LOOP 20 + +BENCH_MEM_EQ_ALL(13, BENCH_MEM_EQ_LOOP, 2000) +BENCH_MEM_EQ_ALL(65, BENCH_MEM_EQ_LOOP, 2000) +BENCH_MEM_EQ_ALL(100, BENCH_MEM_EQ_LOOP, 500) +BENCH_MEM_EQ_ALL(10000, BENCH_MEM_EQ_LOOP, 500) +BENCH_MEM_EQ_ALL(100000, BENCH_MEM_EQ_LOOP, 500) +BENCH_MEM_EQ_ALL(1000000, BENCH_MEM_EQ_LOOP, 200) + +#define BENCH_MEM_CMP_LOOP 20 + +BENCH_MEM_CMP_ALL(2, BENCH_MEM_CMP_LOOP, 2000) +BENCH_MEM_CMP_ALL(13, BENCH_MEM_CMP_LOOP, 2000) +BENCH_MEM_CMP_ALL(65, BENCH_MEM_CMP_LOOP, 2000) +BENCH_MEM_CMP_ALL(100, BENCH_MEM_CMP_LOOP, 500) +BENCH_MEM_CMP_ALL(10000, BENCH_MEM_CMP_LOOP, 500) +BENCH_MEM_CMP_ALL(100000, BENCH_MEM_CMP_LOOP, 500) +BENCH_MEM_CMP_ALL(1000000, BENCH_MEM_CMP_LOOP, 200) BENCH_MEM_STRSTR_ALL(512, 1024, 1, 100); BENCH_MEM_STRSTR_ALL(512, 1024, 7, 100); diff --git a/libs/libcommon/src/tests/bench_memcpy.cpp 
b/libs/libcommon/src/tests/bench_memcpy.cpp new file mode 100644 index 00000000000..4028dc9ae6a --- /dev/null +++ b/libs/libcommon/src/tests/bench_memcpy.cpp @@ -0,0 +1,139 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../libmemcpy/folly/FollyMemcpy.h" +#include "../../libmemcpy/memcpy.h" + +namespace bench +{ + +ALWAYS_INLINE static inline void * tiflash_internal_memcpy(void * __restrict dst, const void * __restrict src, size_t size) +{ + return sse2_inline_memcpy(dst, src, size); +} + +template +class MemUtilsCopy : public benchmark::Fixture +{ +protected: + static const size_t loop_cnt = loop_cnt_; + + std::string dst_buffer; + std::string src_buffer; + std::vector sizes; + std::vector dst_offsets; + std::vector src_offsets; + + void SetUp(const ::benchmark::State & /*state*/) override + { + size_t src_buffer_size = (sysconf(_SC_PAGE_SIZE) * std::ceil(static_cast(max + 2 * align) / sysconf(_SC_PAGE_SIZE))); + size_t dst_buffer_size; + if (hot) + { + dst_buffer_size = src_buffer_size; + } + else + { + dst_buffer_size = 1024 * 1024 * 1024; // 1 GiB + } + dst_buffer.resize(dst_buffer_size); + memset(dst_buffer.data(), 'd', dst_buffer.size()); + src_buffer.resize(src_buffer_size); + memset(src_buffer.data(), 's', src_buffer.size()); + + std::default_random_engine gen; + sizes.resize(4095); + std::uniform_int_distribution size_dist(min, max); + for (auto & size : sizes) + { + size = size_dist(gen); + } + + src_offsets.resize(4096); + dst_offsets.resize(4096); + std::uniform_int_distribution src_offset_dist( + 0, + (src_buffer_size - max) / align); + std::uniform_int_distribution dst_offset_dist( + 0, + (dst_buffer_size - max) / align); + for (size_t i = 0; i < src_offsets.size(); i++) + { + src_offsets[i] = align * src_offset_dist(gen); + dst_offsets[i] = align * dst_offset_dist(gen); + } + } +}; + +#define BENCH_MEM_COPY(id, name, fn_memcpy, iters) \ + BENCHMARK_DEFINE_F(id, name) \ + (benchmark::State & state) \ + { \ + for (auto _ : state) \ + { \ + size_t size_idx = 0; \ + size_t offset_idx = 0; \ + for (unsigned int i = 0; i < loop_cnt; i++) \ + { \ + if (size_idx + 1 == sizes.size()) \ + size_idx = 0; \ + if (offset_idx >= src_offsets.size()) \ + offset_idx = 0; \ + void * dst = &dst_buffer[dst_offsets[offset_idx]]; \ + const void * src = &src_buffer[src_offsets[offset_idx]]; \ + volatile size_t size = sizes[size_idx]; \ + fn_memcpy(dst, src, size); \ + size_idx++; \ + offset_idx++; \ + } \ + } \ + } \ + BENCHMARK_REGISTER_F(id, name)->Iterations(iters); + +#define BENCH_MEM_COPY_ALL_IMPL(id, min, max, align, hot, loop_cnt, iters) \ + using id = MemUtilsCopy; \ + BENCH_MEM_COPY(id, stl_mempy, std::memcpy, iters) \ + BENCH_MEM_COPY(id, inline_clickhouse_memcpy, legacy::inline_memcpy, iters) \ + BENCH_MEM_COPY(id, inline_tiflash_memcpy, tiflash_internal_memcpy, iters) \ + 
BENCH_MEM_COPY(id, folly_memcpy, __folly_memcpy, iters) + +#define BENCH_MEM_IMPL_ID(min, max, align, hot, loop_cnt) MemUtilsCopy##_##min##_##max##_##align##_##hot##_##loop_cnt + +#define BENCH_MEM_COPY_ALL(min, max, align, hot, loop_cnt, iters) \ + BENCH_MEM_COPY_ALL_IMPL(BENCH_MEM_IMPL_ID(min, max, align, hot, loop_cnt), min, max, align, hot, loop_cnt, iters) + +BENCH_MEM_COPY_ALL(1, 20, 3, true, 20000, 500); +BENCH_MEM_COPY_ALL(1, 40, 3, true, 20000, 500); +BENCH_MEM_COPY_ALL(1, 80, 3, true, 20000, 500); +BENCH_MEM_COPY_ALL(1, 200, 3, true, 20000, 500); +BENCH_MEM_COPY_ALL(1, 1000, 3, true, 20000, 500); + + +} // namespace bench \ No newline at end of file diff --git a/libs/libcommon/src/tests/gtest_mem_utils_opt.cpp b/libs/libcommon/src/tests/gtest_mem_utils_opt.cpp index 3895577dca5..49b04ecbb42 100644 --- a/libs/libcommon/src/tests/gtest_mem_utils_opt.cpp +++ b/libs/libcommon/src/tests/gtest_mem_utils_opt.cpp @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -25,6 +26,8 @@ #include #include +#include "../../libmemcpy/folly/FollyMemcpy.h" + #if defined(TIFLASH_ENABLE_AVX_SUPPORT) void TestFunc(size_t size) @@ -152,6 +155,13 @@ TEST(MemUtilsTestOPT, CompareNormal) std::string b(2, char(1)); ASSERT_EQ(-1, mem_utils::StrFind({start, size}, b)); + ASSERT_EQ(-1, + mem_utils::avx2_strstr(start, size, b.data(), b.size())); + } + { + std::string a(32, char(0)); + char * p = a.data() + 16 - size_t(a.data()) % 16 + 5; + ASSERT_EQ(nullptr, mem_utils::avx2_memchr(p, 5, char(1))); } } @@ -172,3 +182,63 @@ TEST(MemUtilsTestOPT, CompareStr) } #endif + +template +void TestMemCopyFunc(size_t size, F && fn_memcpy) +{ + std::string oa(size + 100, 0); + char * start = oa.data(); + start += (16 - size_t(start) % 16); + start += 5; + { + uint8_t n1 = 1, n2 = 2; + for (auto * p = start; p != start + size; ++p) + { + *p = n1 + n2; + n1 = n2; + n2 = *p; + } + } + + std::string ob; + char * tar{}; + + if constexpr (overlap_offset) + tar = start + overlap_offset; + else + { + ob.resize(size + 100, 0); + tar = ob.data(); + tar += (16 - size_t(tar) % 16); + tar += 1; + } + + fn_memcpy(tar, start, size); + { + uint8_t n1 = 1, n2 = 2; + for (const auto * p = tar; p != tar + size; ++p) + { + ASSERT_EQ(uint8_t(*p), uint8_t(n1 + n2)); + n1 = n2; + n2 = *p; + } + } +} + +#if defined(__SSE2__) + +TEST(MemUtilsTestOPT, Memcopy) +{ + for (size_t size = 0; size < 256; ++size) + { + TestMemCopyFunc<0>(size, __folly_memcpy); + { + // test memmove + TestMemCopyFunc<3>(size, __folly_memcpy); + TestMemCopyFunc<-3>(size, __folly_memcpy); + } + TestMemCopyFunc<0>(size, inline_memcpy); + } +} + +#endif \ No newline at end of file diff --git a/libs/libmemcpy/CMakeLists.txt b/libs/libmemcpy/CMakeLists.txt index 2efd2cb78e4..a329d64bb49 100644 --- a/libs/libmemcpy/CMakeLists.txt +++ b/libs/libmemcpy/CMakeLists.txt @@ -12,5 +12,36 @@ # See the License for the specific language governing permissions and # limitations under the License. 
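As a side note on the test strategy: TestMemCopyFunc above fills the source with a position-dependent byte pattern, copies it with the implementation under test (optionally into an overlapping destination to exercise memmove semantics), and then re-derives the pattern at the destination. A minimal standalone sketch of the same idea follows; the helper name, the fixed 3-byte overlap, and the use of assert are illustrative only and not part of this patch.

```cpp
// Pattern-fill-and-verify sketch for a copy routine, modeled on TestMemCopyFunc.
// `copy_fn` stands for whichever implementation is under test
// (__folly_memcpy, inline_memcpy, ...). Illustrative only.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

template <typename CopyFn>
void check_copy_with_overlap(size_t size, CopyFn copy_fn)
{
    std::vector<uint8_t> buf(size + 64, 0);
    uint8_t * src = buf.data();
    // Deterministic, position-dependent pattern so one misplaced byte is caught.
    uint8_t n1 = 1, n2 = 2;
    for (size_t i = 0; i < size; ++i)
    {
        src[i] = static_cast<uint8_t>(n1 + n2);
        n1 = n2;
        n2 = src[i];
    }
    uint8_t * dst = src + 3; // overlapping destination => memmove semantics
    copy_fn(dst, src, size);
    // Re-derive the pattern at the destination and compare.
    n1 = 1;
    n2 = 2;
    for (size_t i = 0; i < size; ++i)
    {
        assert(dst[i] == static_cast<uint8_t>(n1 + n2));
        n1 = n2;
        n2 = dst[i];
    }
}
```

For example, `check_copy_with_overlap(200, __folly_memcpy)` checks memmove-style behavior on an overlapping pair of buffers, which is the same situation the overlap variants of TestMemCopyFunc cover.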
-add_library (memcpy STATIC memcpy.cpp) +option(TIFLASH_FOLLY_MEMCPY_IS_MEMCPY "use folly memcpy as default `memcpy` and `memmove`" ON) + +set (memcpy_sources) + +# only enbale folly memcpy under linux x86_64 with avx2 supported +if (ARCH_LINUX AND TIFLASH_ENABLE_AVX_SUPPORT) + set_property(SOURCE folly/memcpy.S PROPERTY LANGUAGE CXX) + set_property(SOURCE folly/memcpy.S APPEND PROPERTY COMPILE_OPTIONS "-x" "assembler-with-cpp") + set_property(SOURCE folly/memcpy.S APPEND PROPERTY COMPILE_FLAGS "-mavx -mavx2") + list (APPEND memcpy_sources folly/memcpy.S) + message (STATUS "`libmemcpy` support Folly memcpy") +else () + set (TIFLASH_FOLLY_MEMCPY_IS_MEMCPY OFF) +endif () + +if (USE_INTERNAL_MEMCPY) + if (TIFLASH_FOLLY_MEMCPY_IS_MEMCPY) + message (STATUS "Using Folly memcpy as default `memcpy` and `memmove`") + add_definitions(-DFOLLY_MEMCPY_IS_MEMCPY=1) + else () + message (STATUS "Using internal memcpy") + list (APPEND memcpy_sources memcpy.cpp) + endif () +else () + add_definitions(-DNO_TIFLASH_INTERNAL_MEMCPY=1) + list (APPEND memcpy_sources memcpy.cpp) +endif() + +add_library (memcpy STATIC ${memcpy_sources}) target_include_directories(memcpy PUBLIC ${TiFlash_SOURCE_DIR}/libs/libcommon/include) + +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fomit-frame-pointer") +set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fomit-frame-pointer") diff --git a/libs/libmemcpy/folly/FollyMemcpy.h b/libs/libmemcpy/folly/FollyMemcpy.h new file mode 100644 index 00000000000..4022d1b4b13 --- /dev/null +++ b/libs/libmemcpy/folly/FollyMemcpy.h @@ -0,0 +1,40 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// @author: Logan Evans + +#include + +extern "C" { +void * __folly_memcpy( + void * __restrict dst, + const void * __restrict src, + size_t size); +} diff --git a/libs/libmemcpy/folly/memcpy.S b/libs/libmemcpy/folly/memcpy.S new file mode 100644 index 00000000000..acdfcf7d18d --- /dev/null +++ b/libs/libmemcpy/folly/memcpy.S @@ -0,0 +1,479 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * __folly_memcpy: An optimized memcpy implementation that uses prefetch and + * AVX2 instructions. + * + * This implementation of memcpy acts as a memmove: while overlapping copies + * are undefined in memcpy, in some implementations they're the same function and + * legacy programs rely on this behavior. + * + * This implementation uses prefetch to avoid dtlb misses. This can + * substantially reduce dtlb store misses in cases where the destination + * location is absent from L1 cache and where the copy size is small enough + * that the hardware prefetcher doesn't have a large impact. + * + * The number of branches is limited by the use of overlapping loads & stores. + * This helps with copies where the source and destination cache lines are already + * present in L1 because there are fewer instructions to execute and fewer + * branches to potentially mispredict. + * e.g. to copy the last 4 <= n <= 7 bytes: copy the first & last 4 bytes (overlapped): + * movl (%rsi), %r8d + * movl -4(%rsi,%rdx), %r9d + * movl %r8d, (%rdi) + * movl %r9d, -4(%rdi,%rdx) + * + * + * For sizes up to 256 all source data is first read into registers and then written: + * - n <= 16: overlapping movs + * - n <= 32: overlapping unaligned 16-byte SSE XMM load/stores + * - n <= 256: overlapping unaligned 32-byte AVX YMM load/stores + * + * Large copies (> 256 bytes) use unaligned loads + aligned stores. + * This is observed to always be faster than rep movsb, so the rep movsb + * instruction is not used. + * - The head & tail may be unaligned => they're always written using unaligned stores. + * + * If the copy size is humongous (> 32 KiB) and the source and destination are both + * aligned, this memcpy will use non-temporal operations (AVX2). This can have + * a substantial speedup for copies where data is absent from L1, but it + * is significantly slower if the source and destination data were already + * in L1. The use of non-temporal operations also has the effect that after + * the copy is complete, the data will be moved out of L1, even if the data was + * present before the copy started. 
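The "overlapping loads & stores" idea from the comment above is worth spelling out at the C level. This is a minimal sketch of the 4 <= n <= 7 case shown with movl instructions; the function name is illustrative and the authoritative version is the assembly that follows.

```cpp
// Overlapping load/store trick for 4 <= n <= 7 bytes: read the first and last
// 4 bytes before writing anything, so two (possibly overlapping) stores cover
// the whole range with no per-byte loop and no extra branches.
#include <cstddef>
#include <cstdint>
#include <cstring>

static inline void copy_4_to_7(void * dst, const void * src, size_t n)
{
    // Precondition: 4 <= n <= 7.
    uint32_t head, tail;
    std::memcpy(&head, src, 4);
    std::memcpy(&tail, static_cast<const char *>(src) + n - 4, 4);
    std::memcpy(dst, &head, 4);
    std::memcpy(static_cast<char *>(dst) + n - 4, &tail, 4);
}
```

Because both loads complete before either store, the routine is also correct when source and destination overlap, which is why copies of up to 256 bytes behave as memmove in this implementation.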
+ * + * For n > 256 and overlapping src & dst buffers (memmove): + * - use unaligned loads + aligned stores, but not non-temporal stores + * - for dst < src forward copy in 128 byte batches: + * - unaligned load the first 32 bytes & last 4 x 32 bytes + * - forward copy (unaligned load + aligned stores) 4 x 32 bytes at a time + * - unaligned store the first 32 bytes & last 4 x 32 bytes + * - for dst > src backward copy in 128 byte batches: + * - unaligned load the first 4 x 32 bytes & last 32 bytes + * - backward copy (unaligned load + aligned stores) 4 x 32 bytes at a time + * - unaligned store the first 4 x 32 bytes & last 32 bytes + * + * @author Logan Evans + */ + +#if defined(__AVX2__) + +#if defined(PREFETCH) +#undef PREFETCH +#endif +#if __PRFCHW__ // Broadwell+ +#define PREFETCH prefetchw +#else +#define PREFETCH prefetcht0 +#endif + +// This threshold is half of L1 cache on a Skylake machine, which means that +// potentially all of L1 will be populated by this copy once it is executed +// (dst and src are cached for temporal copies). +#define NON_TEMPORAL_STORE_THRESHOLD $32768 + + .file "memcpy.S" + .section .text,"ax" + + .type __folly_memcpy_short, @function +__folly_memcpy_short: + .cfi_startproc + +.L_GE1_LE7: + cmp $1, %rdx + je .L_EQ1 + + cmp $4, %rdx + jae .L_GE4_LE7 + +.L_GE2_LE3: + movw (%rsi), %r8w + movw -2(%rsi,%rdx), %r9w + movw %r8w, (%rdi) + movw %r9w, -2(%rdi,%rdx) + ret + + .align 2 +.L_EQ1: + movb (%rsi), %r8b + movb %r8b, (%rdi) + ret + + // Aligning the target of a jump to an even address has a measurable + // speedup in microbenchmarks. + .align 2 +.L_GE4_LE7: + movl (%rsi), %r8d + movl -4(%rsi,%rdx), %r9d + movl %r8d, (%rdi) + movl %r9d, -4(%rdi,%rdx) + ret + + .cfi_endproc + .size __folly_memcpy_short, .-__folly_memcpy_short + +// memcpy is an alternative entrypoint into the function named __folly_memcpy. +// The compiler is able to call memcpy since the name is global while +// stacktraces will show __folly_memcpy since that is the name of the function. +// This is intended to aid in debugging by making it obvious which version of +// memcpy is being used. 
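Before the large-copy entry point, here is a C-level sketch of the n > 256 dispatch it implements: take the overlap-free path (which may use non-temporal stores) only when the two ranges are provably disjoint, otherwise choose forward or backward 128-byte batching depending on which side the destination lies. The enum and function names are illustrative; the real checks are the lea/cmp sequences in the assembly below.

```cpp
// Overlap dispatch for large copies, mirroring the logic described above.
#include <cstddef>
#include <cstdint>

enum class CopyPath { NoOverlap, OverlapForward, OverlapBackward, Nothing };

static inline CopyPath pick_large_copy_path(const void * dst, const void * src, size_t n)
{
    auto d = reinterpret_cast<uintptr_t>(dst);
    auto s = reinterpret_cast<uintptr_t>(src);
    if (s + n <= d || d + n <= s)
        return CopyPath::NoOverlap; // ranges are disjoint: non-temporal stores allowed
    if (d == s)
        return CopyPath::Nothing; // same buffer: nothing to move
    return d < s ? CopyPath::OverlapForward // dst below src: copy front-to-back
                 : CopyPath::OverlapBackward; // dst above src: copy back-to-front
}
```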
+ .align 64 + .globl __folly_memcpy + .type __folly_memcpy, @function + +__folly_memcpy: + .cfi_startproc + + mov %rdi, %rax # return: $rdi + + test %rdx, %rdx + je .L_EQ0 + + PREFETCH (%rdi) + PREFETCH -1(%rdi,%rdx) + + cmp $8, %rdx + jb .L_GE1_LE7 + +.L_GE8: + cmp $32, %rdx + ja .L_GE33 + +.L_GE8_LE32: + cmp $16, %rdx + ja .L_GE17_LE32 + +.L_GE8_LE16: + mov (%rsi), %r8 + mov -8(%rsi,%rdx), %r9 + mov %r8, (%rdi) + mov %r9, -8(%rdi,%rdx) +.L_EQ0: + ret + + .align 2 +.L_GE17_LE32: + movdqu (%rsi), %xmm0 + movdqu -16(%rsi,%rdx), %xmm1 + movdqu %xmm0, (%rdi) + movdqu %xmm1, -16(%rdi,%rdx) + ret + + .align 2 +.L_GE193_LE256: + vmovdqu %ymm3, 96(%rdi) + vmovdqu %ymm4, -128(%rdi,%rdx) + +.L_GE129_LE192: + vmovdqu %ymm2, 64(%rdi) + vmovdqu %ymm5, -96(%rdi,%rdx) + +.L_GE65_LE128: + vmovdqu %ymm1, 32(%rdi) + vmovdqu %ymm6, -64(%rdi,%rdx) + +.L_GE33_LE64: + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm7, -32(%rdi,%rdx) + + vzeroupper + ret + + .align 2 +.L_GE33: + vmovdqu (%rsi), %ymm0 + vmovdqu -32(%rsi,%rdx), %ymm7 + + cmp $64, %rdx + jbe .L_GE33_LE64 + + PREFETCH 64(%rdi) + + vmovdqu 32(%rsi), %ymm1 + vmovdqu -64(%rsi,%rdx), %ymm6 + + cmp $128, %rdx + jbe .L_GE65_LE128 + + PREFETCH 128(%rdi) + + vmovdqu 64(%rsi), %ymm2 + vmovdqu -96(%rsi,%rdx), %ymm5 + + cmp $192, %rdx + jbe .L_GE129_LE192 + + PREFETCH 192(%rdi) + + vmovdqu 96(%rsi), %ymm3 + vmovdqu -128(%rsi,%rdx), %ymm4 + + cmp $256, %rdx + jbe .L_GE193_LE256 + +.L_GE257: + PREFETCH 256(%rdi) + + // Check if there is an overlap. If there is an overlap then the caller + // has a bug since this is undefined behavior. However, for legacy + // reasons this behavior is expected by some callers. + // + // All copies through 256 bytes will operate as a memmove since for + // those sizes all reads are performed before any writes. + // + // This check uses the idea that there is an overlap if + // (%rdi < (%rsi + %rdx)) && (%rsi < (%rdi + %rdx)), + // or equivalently, there is no overlap if + // ((%rsi + %rdx) <= %rdi) || ((%rdi + %rdx) <= %rsi). + // + // %r9 will be used after .L_ALIGNED_DST_LOOP to calculate how many + // bytes remain to be copied. + + // (%rsi + %rdx <= %rdi) => no overlap + lea (%rsi,%rdx), %r9 + cmp %rdi, %r9 + jbe .L_NO_OVERLAP + + // (%rdi + %rdx <= %rsi) => no overlap + lea (%rdi,%rdx), %r8 + cmp %rsi, %r8 + // If no info is available in branch predictor's cache, Intel CPUs assume + // forward jumps are not taken. Use a forward jump as overlapping buffers + // are unlikely. + ja .L_OVERLAP + + .align 2 +.L_NO_OVERLAP: + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, 32(%rdi) + vmovdqu %ymm2, 64(%rdi) + vmovdqu %ymm3, 96(%rdi) + + // Align %rdi to a 32 byte boundary. + // %rcx = 128 - 31 & %rdi + mov $128, %rcx + and $31, %rdi + sub %rdi, %rcx + + lea (%rsi,%rcx), %rsi + lea (%rax,%rcx), %rdi + sub %rcx, %rdx + + // %r8 is the end condition for the loop. 
+ lea -128(%rsi,%rdx), %r8 + + cmp NON_TEMPORAL_STORE_THRESHOLD, %rdx + jae .L_NON_TEMPORAL_LOOP + + .align 2 +.L_ALIGNED_DST_LOOP: + PREFETCH 128(%rdi) + PREFETCH 192(%rdi) + + vmovdqu (%rsi), %ymm0 + vmovdqu 32(%rsi), %ymm1 + vmovdqu 64(%rsi), %ymm2 + vmovdqu 96(%rsi), %ymm3 + add $128, %rsi + + vmovdqa %ymm0, (%rdi) + vmovdqa %ymm1, 32(%rdi) + vmovdqa %ymm2, 64(%rdi) + vmovdqa %ymm3, 96(%rdi) + add $128, %rdi + + cmp %r8, %rsi + jb .L_ALIGNED_DST_LOOP + +.L_ALIGNED_DST_LOOP_END: + sub %rsi, %r9 + mov %r9, %rdx + + vmovdqu %ymm4, -128(%rdi,%rdx) + vmovdqu %ymm5, -96(%rdi,%rdx) + vmovdqu %ymm6, -64(%rdi,%rdx) + vmovdqu %ymm7, -32(%rdi,%rdx) + + vzeroupper + ret + + .align 2 +.L_NON_TEMPORAL_LOOP: + testb $31, %sil + jne .L_ALIGNED_DST_LOOP + // This is prefetching the source data unlike ALIGNED_DST_LOOP which + // prefetches the destination data. This choice is again informed by + // benchmarks. With a non-temporal store the entirety of the cache line + // is being written so the previous data can be discarded without being + // fetched. + prefetchnta 128(%rsi) + prefetchnta 196(%rsi) + + vmovntdqa (%rsi), %ymm0 + vmovntdqa 32(%rsi), %ymm1 + vmovntdqa 64(%rsi), %ymm2 + vmovntdqa 96(%rsi), %ymm3 + add $128, %rsi + + vmovntdq %ymm0, (%rdi) + vmovntdq %ymm1, 32(%rdi) + vmovntdq %ymm2, 64(%rdi) + vmovntdq %ymm3, 96(%rdi) + add $128, %rdi + + cmp %r8, %rsi + jb .L_NON_TEMPORAL_LOOP + + sfence + jmp .L_ALIGNED_DST_LOOP_END + + +.L_OVERLAP: + .align 2 + cmp %rdi, %rsi + jb .L_OVERLAP_BWD // %rsi < %rdi => backward-copy + je .L_RET // %rsi == %rdi => return, nothing to copy + + // Source & destination buffers overlap. Forward copy. + + vmovdqu (%rsi), %ymm8 + + // Align %rdi to a 32 byte boundary. + // %rcx = 32 - 31 & %rdi + mov $32, %rcx + and $31, %rdi + sub %rdi, %rcx + + lea (%rsi,%rcx), %rsi + lea (%rax,%rcx), %rdi + sub %rcx, %rdx + + // %r8 is the end condition for the loop. + lea -128(%rsi,%rdx), %r8 + + +.L_OVERLAP_FWD_ALIGNED_DST_LOOP: + PREFETCH 128(%rdi) + PREFETCH 192(%rdi) + + vmovdqu (%rsi), %ymm0 + vmovdqu 32(%rsi), %ymm1 + vmovdqu 64(%rsi), %ymm2 + vmovdqu 96(%rsi), %ymm3 + add $128, %rsi + + vmovdqa %ymm0, (%rdi) + vmovdqa %ymm1, 32(%rdi) + vmovdqa %ymm2, 64(%rdi) + vmovdqa %ymm3, 96(%rdi) + add $128, %rdi + + cmp %r8, %rsi + jb .L_OVERLAP_FWD_ALIGNED_DST_LOOP + + sub %rsi, %r9 + mov %r9, %rdx + + vmovdqu %ymm4, -128(%rdi,%rdx) + vmovdqu %ymm5, -96(%rdi,%rdx) + vmovdqu %ymm6, -64(%rdi,%rdx) + vmovdqu %ymm7, -32(%rdi,%rdx) + vmovdqu %ymm8, (%rax) // %rax == the original (unaligned) %rdi + + vzeroupper + +.L_RET: + ret + +.L_OVERLAP_BWD: + # Save last 32 bytes. + vmovdqu -32(%rsi, %rdx), %ymm8 + lea -32(%rdi, %rdx), %r9 + + + // %r8 is the end condition for the loop. + lea 128(%rsi), %r8 + + // Align %rdi+%rdx (destination end) to a 32 byte boundary. + // %rcx = (%rdi + %rdx - 32) & 31 + mov %r9, %rcx + and $31, %rcx + // Set %rsi & %rdi to the end of the 32 byte aligned range. 
+ sub %rcx, %rdx + add %rdx, %rsi + add %rdx, %rdi + + +.L_OVERLAP_BWD_ALIGNED_DST_LOOP: + PREFETCH -128(%rdi) + PREFETCH -192(%rdi) + + vmovdqu -32(%rsi), %ymm4 + vmovdqu -64(%rsi), %ymm5 + vmovdqu -96(%rsi), %ymm6 + vmovdqu -128(%rsi), %ymm7 + sub $128, %rsi + + vmovdqa %ymm4, -32(%rdi) + vmovdqa %ymm5, -64(%rdi) + vmovdqa %ymm6, -96(%rdi) + vmovdqa %ymm7, -128(%rdi) + sub $128, %rdi + + cmp %r8, %rsi + ja .L_OVERLAP_BWD_ALIGNED_DST_LOOP + + vmovdqu %ymm0, (%rax) // %rax == the original unaligned %rdi + vmovdqu %ymm1, 32(%rax) + vmovdqu %ymm2, 64(%rax) + vmovdqu %ymm3, 96(%rax) + vmovdqu %ymm8, (%r9) + + vzeroupper + ret + + .cfi_endproc + .size __folly_memcpy, .-__folly_memcpy + +#ifdef FOLLY_MEMCPY_IS_MEMCPY + .weak memcpy + memcpy = __folly_memcpy + + .weak memmove + memmove = __folly_memcpy +#endif + + .ident "GCC: (GNU) 4.8.2" + +#endif +#ifdef __linux__ + .section .note.GNU-stack,"",@progbits +#endif diff --git a/libs/libmemcpy/memcpy.cpp b/libs/libmemcpy/memcpy.cpp index e32ce6ea441..92fe2389b27 100644 --- a/libs/libmemcpy/memcpy.cpp +++ b/libs/libmemcpy/memcpy.cpp @@ -12,11 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "memcpy.h" +#ifndef NO_TIFLASH_INTERNAL_MEMCPY + +#if defined(__SSE2__) + +#include /// This is needed to generate an object file for linking. extern "C" __attribute__((visibility("default"))) void * memcpy(void * __restrict dst, const void * __restrict src, size_t size) { - return inline_memcpy(dst, src, size); + return sse2_inline_memcpy(dst, src, size); } + +#endif + +#endif \ No newline at end of file diff --git a/libs/libmemcpy/memcpy.h b/libs/libmemcpy/memcpy.h index a8bd69967f5..e25e16a6f13 100644 --- a/libs/libmemcpy/memcpy.h +++ b/libs/libmemcpy/memcpy.h @@ -18,6 +18,9 @@ #include #include +namespace legacy +{ + /** Custom memcpy implementation for ClickHouse. * It has the following benefits over using glibc's implementation: * 1. Avoiding dependency on specific version of glibc's symbol, like memcpy@@GLIBC_2.14 for portability. @@ -226,3 +229,4 @@ ALWAYS_INLINE static inline void * inline_memcpy(void * __restrict dst_, const v } return ret; } +} // namespace legacy \ No newline at end of file
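When FOLLY_MEMCPY_IS_MEMCPY is defined, the weak aliases above make the global `memcpy` and `memmove` symbols resolve to `__folly_memcpy`. A quick sanity check along these lines could confirm the aliasing took effect in a given build; this snippet is an assumption-laden sketch, not part of the patch, and it relies on the aliased symbol sharing an address with `__folly_memcpy`.

```cpp
// Sanity check (assumption: built with FOLLY_MEMCPY_IS_MEMCPY=1 so that the
// weak aliases in memcpy.S redirect memcpy/memmove to __folly_memcpy).
#include <cstdio>
#include <cstring>

extern "C" void * __folly_memcpy(void * __restrict dst, const void * __restrict src, size_t size);

int main()
{
    // Compare the addresses of the linked memcpy and __folly_memcpy symbols.
    auto plain = reinterpret_cast<void (*)()>(&std::memcpy);
    auto folly = reinterpret_cast<void (*)()>(&__folly_memcpy);
    std::printf("memcpy aliased to __folly_memcpy: %s\n", plain == folly ? "yes" : "no");
    return 0;
}
```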