[libc] Optimize memcpy size thresholds #70049
Conversation
@llvm/pr-subscribers-libc

Author: Dmitry Vyukov (dvyukov)

Changes: Adjust boundary conditions for sizes = 16/32/64. See the added comment for explanations. Results on a machine with AVX2, so sizes 64/128 affected:

                 │  baseline   │             adjusted             │
                 │   sec/op    │   sec/op     vs base             │
memcpy/Google_A    5.701n ± 0%   5.551n ± 1%   -2.63% (n=100)
memcpy/Google_B    3.817n ± 0%   3.776n ± 0%   -1.07% (p=0.000 n=100)
memcpy/Google_D    11.35n ± 1%   11.32n ± 0%        ~ (p=0.066 n=100)
memcpy/Google_U    3.874n ± 1%   3.821n ± 1%   -1.37% (p=0.001 n=100)
memcpy/64          3.843n ± 0%   3.105n ± 3%  -19.22% (n=50)
memcpy/128         4.842n ± 0%   3.818n ± 0%  -21.15% (p=0.000 n=50)
Full diff: https://github.com/llvm/llvm-project/pull/70049.diff (1 file affected)
diff --git a/libc/src/string/memory_utils/x86_64/inline_memcpy.h b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
index f43230ffd8ad125..81c995fff51d6d6 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memcpy.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
@@ -55,7 +55,7 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
- if (count < 128)
+ if (count <= 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
builtin::Memcpy<32>::block(dst, src);
align_to_next_boundary<32, Arg::Dst>(dst, src, count);
@@ -65,7 +65,7 @@ inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
- if (count < 128)
+ if (count <= 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
if (count < 256)
return builtin::Memcpy<128>::head_tail(dst, src, count);
@@ -79,7 +79,7 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
CPtr __restrict src, size_t count) {
using namespace LIBC_NAMESPACE::x86;
prefetch_to_local_cache(src + kOneCacheline);
- if (count < 128)
+ if (count <= 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
prefetch_to_local_cache(src + kTwoCachelines);
// Aligning 'dst' on a 32B boundary.
@@ -120,7 +120,7 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
CPtr __restrict src, size_t count) {
using namespace LIBC_NAMESPACE::x86;
prefetch_to_local_cache(src + kOneCacheline);
- if (count < 128)
+ if (count <= 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
prefetch_to_local_cache(src + kTwoCachelines);
prefetch_to_local_cache(src + kThreeCachelines);
@@ -149,6 +149,15 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
+#if defined(__AVX512F__)
+ constexpr size_t vector_size = 64;
+#elif defined(__AVX__)
+ constexpr size_t vector_size = 32;
+#elif defined(__SSE2__)
+ constexpr size_t vector_size = 16;
+#else
+ constexpr size_t vector_size = 8;
+#endif
if (count == 0)
return;
if (count == 1)
@@ -161,11 +170,20 @@ inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
return builtin::Memcpy<4>::block(dst, src);
if (count < 8)
return builtin::Memcpy<4>::head_tail(dst, src, count);
- if (count < 16)
+ // If count is equal to a power of 2, we can handle it as head-tail
+ // of both smaller size and larger size (head-tail are either
+ // non-overlapping for smaller size, or completely collapsed
+ // for larger size). It seems to be more profitable to do the copy
+ // with the larger size, if it's natively supported (e.g. doing
+ // 2 collapsed 32-byte moves for count=64 if AVX2 is supported).
+ // But it's not profitable to use larger size if it's not natively
+ // supported: we will both use more instructions and handle fewer
+ // sizes in earlier branches.
+ if (count < 16 + (vector_size <= 8))
return builtin::Memcpy<8>::head_tail(dst, src, count);
- if (count < 32)
+ if (count < 32 + (vector_size <= 16))
return builtin::Memcpy<16>::head_tail(dst, src, count);
- if (count < 64)
+ if (count < 64 + (vector_size <= 32))
return builtin::Memcpy<32>::head_tail(dst, src, count);
if constexpr (x86::kAvx) {
if constexpr (x86::kUseSoftwarePrefetching) {
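The `<` to `<=` flips above all rely on one property of the head/tail idiom. Here is a minimal sketch of that idiom, as a simplified stand-in for builtin::Memcpy<K>::head_tail (the template below is illustrative, not the actual libc internals):

#include <cstddef>
#include <cstring>

// Illustrative stand-in for builtin::Memcpy<K>::head_tail: copy the first
// K bytes and the last K bytes. This covers every count in the closed
// range [K, 2*K]: at count == 2*K the two blocks are exactly adjacent,
// and at count == K they fully collapse onto the same bytes. Overlap in
// between is harmless because src and dst themselves do not alias.
template <std::size_t K>
void head_tail(char *__restrict dst, const char *__restrict src,
               std::size_t count) {
  std::memcpy(dst, src, K);                         // head: bytes [0, K)
  std::memcpy(dst + count - K, src + count - K, K); // tail: bytes [count-K, count)
}

Because the closed upper end count == 2*K is still covered, Memcpy<64>::head_tail remains valid for count == 128, which is exactly what lets the guards relax from count < 128 to count <= 128.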
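As for where the large memcpy/64 and memcpy/128 wins come from: a back-of-the-envelope model (my own illustration, not code from the patch) that counts 32-byte vector moves on an AVX2 target shows the new dispatch halving the work for both sizes:

#include <cstddef>
#include <cstdio>

// On AVX2, a Size-byte block costs Size/32 vector moves, and head_tail
// issues one block at the head and one at the tail: 2 * (Size / 32) moves.
constexpr std::size_t head_tail_moves(std::size_t block_size) {
  return 2 * (block_size / 32);
}

int main() {
  // count == 64: the old dispatch used Memcpy<64>::head_tail (two fully
  // collapsed 64-byte blocks), the new one uses Memcpy<32>::head_tail
  // (two adjacent 32-byte blocks): 4 moves -> 2 moves.
  std::printf("count=64:  %zu -> %zu moves\n",
              head_tail_moves(64), head_tail_moves(32));
  // count == 128: the old dispatch used Memcpy<128>::head_tail, the new
  // one uses Memcpy<64>::head_tail: 8 moves -> 4 moves.
  std::printf("count=128: %zu -> %zu moves\n",
              head_tail_moves(128), head_tail_moves(64));
  return 0;
}

Halving the move count lines up with the roughly 19-21% improvements the benchmark reports for those two sizes.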
Force-pushed from 0029ba7 to 4092097.
I've updated the change to use the same approach as memmove.
LGTM