Skip to content

Commit

Permalink
Make memchr() and memccpy() faster
Browse files Browse the repository at this point in the history
  • Loading branch information
jart committed Sep 30, 2024
1 parent fef24d6 commit e4d6eb3
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 24 deletions.
39 changes: 22 additions & 17 deletions libc/intrin/memchr.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@
#include "libc/dce.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#include "third_party/aarch64/arm_neon.internal.h"
#include "third_party/intel/immintrin.internal.h"
#ifndef __aarch64__

typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));

static inline const unsigned char *memchr_pure(const unsigned char *s,
unsigned char c, size_t n) {
size_t i;
Expand All @@ -35,22 +35,27 @@ static inline const unsigned char *memchr_pure(const unsigned char *s,
}

#if defined(__x86_64__) && !defined(__chibicc__)
static __vex const unsigned char *memchr_sse(const unsigned char *s,
unsigned char c, size_t n) {
size_t i;
unsigned m;
xmm_t v, t = {c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c};
for (; n >= 16; n -= 16, s += 16) {
v = *(const xmm_t *)s;
m = __builtin_ia32_pmovmskb128(v == t);
if (m) {
m = __builtin_ctzll(m);
return s + m;
}
static const char *memchr_sse(const char *s, char c, size_t n) {
const char *e = s + n;
__m128i t = _mm_set1_epi8(c);
unsigned m, k = (uintptr_t)s & 15;
m = _mm_movemask_epi8(
_mm_cmpeq_epi8(_mm_load_si128((const __m128i *)((uintptr_t)s & -16)), t));
m >>= k;
if (m) {
s += __builtin_ctz(m);
if (s < e)
return s;
return 0;
}
for (i = 0; i < n; ++i) {
if (s[i] == c) {
return s + i;
for (s += 16 - k; s < e; s += 16) {
m = _mm_movemask_epi8(
_mm_cmpeq_epi8(_mm_load_si128((const __m128i *)s), t));
if (m) {
s += __builtin_ctz(m);
if (s < e)
return s;
return 0;
}
}
return 0;
Expand Down
15 changes: 8 additions & 7 deletions libc/str/memccpy.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,14 @@
* @asyncsignalsafe
*/
void *memccpy(void *dst, const void *src, int c, size_t n) {
char *d;
size_t i;
const char *s;
for (d = dst, s = src, i = 0; i < n; ++i) {
if (((d[i] = s[i]) & 255) == (c & 255)) {
return d + i + 1;
}
const char *p;
// this memchr() call is only correct if your memchr() implementation
// offers the same readahead safety guarantees as cosmopolitan's does
if ((p = memchr(src, c, n))) {
size_t m = p + 1 - (const char *)src;
memmove(dst, src, m);
return (char *)dst + m;
}
memmove(dst, src, n);
return 0;
}
65 changes: 65 additions & 0 deletions test/libc/str/memccpy_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,18 @@
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/assert.h"
#include "libc/calls/calls.h"
#include "libc/intrin/safemacros.h"
#include "libc/mem/mem.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/sysconf.h"
#include "libc/stdio/rand.h"
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/prot.h"
#include "libc/testlib/benchmark.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/testlib.h"

Expand Down Expand Up @@ -50,6 +58,40 @@ TEST(memccpy, testZeroLength_doesNothing) {
EXPECT_EQ(NULL, memccpy(buf, "hi", '\0', 0));
}

// Cross-checks memccpy() against memccpy_pure() (declared outside this
// block) for many destination/source sizes. Every buffer is placed so its
// last byte is the last accessible byte before a PROT_NONE page, which
// makes any access past the end of a buffer fault.
TEST(memccpy, fuzz) {
  int pagesz = sysconf(_SC_PAGESIZE);
  // each mapping is one read/write page followed by one inaccessible page
  char *map1 = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
  npassert(map1 != MAP_FAILED);
  npassert(!mprotect(map1 + pagesz, pagesz, PROT_NONE));
  char *map2 = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
  npassert(map2 != MAP_FAILED);
  npassert(!mprotect(map2 + pagesz, pagesz, PROT_NONE));
  char *map3 = (char *)mmap(0, pagesz * 2, PROT_READ | PROT_WRITE,
                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
  npassert(map3 != MAP_FAILED);
  npassert(!mprotect(map3 + pagesz, pagesz, PROT_NONE));
  for (int dsize = 1; dsize < 128; ++dsize) {
    // the two destinations must live in separate mappings; if they alias,
    // the memcmp() below compares a buffer with itself and cannot fail
    char *volatile dst1 = map1 + pagesz - dsize;
    char *volatile dst2 = map2 + pagesz - dsize;
    // seed both destinations with identical random bytes so that any
    // bytes a copy leaves untouched still compare equal
    for (int i = 0; i < dsize; ++i)
      dst1[i] = dst2[i] = rand();
    for (int ssize = 1; ssize < dsize * 2; ++ssize) {
      // source is ssize nonzero bytes followed by a NUL, where the NUL is
      // the final accessible byte of map3
      char *volatile src = map3 + pagesz - (ssize + 1);
      for (int i = 0; i < ssize; ++i)
        src[i] = max(rand() & 255, 1);
      src[ssize] = 0;
      char *end1 = (char *)memccpy_pure(dst1, src, 0, dsize);
      char *end2 = (char *)memccpy(dst2, src, 0, dsize);
      // the results point into different buffers, so compare them as
      // offsets from their own destination, using -1 to represent NULL
      ASSERT_EQ(end1 ? end1 - dst1 : -1, end2 ? end2 - dst2 : -1);
      ASSERT_EQ(0, memcmp(dst1, dst2, dsize));
    }
  }
  npassert(!munmap(map3, pagesz * 2));
  npassert(!munmap(map2, pagesz * 2));
  npassert(!munmap(map1, pagesz * 2));
}

TEST(memccpy, memcpy) {
unsigned n, n1, n2;
char *b1, *b2, *b3, *e1, *e2;
Expand Down Expand Up @@ -78,3 +120,26 @@ TEST(memccpy, memcpy) {
free(b1);
}
}

// Capacity of the benchmark buffers and the byte limit handed to every
// memccpy() / memccpy_pure() call below.
#define N 4096

// Times memccpy() and then memccpy_pure() on sources whose terminating NUL
// sits at offsets 1, 2, 4, ..., N.
// NOTE(review): BENCH, BENCHMARK, X and V are project macros not visible
// here; presumably from libc/testlib/benchmark.h and ezbench.h -- confirm.
BENCH(memccpy, bench) {
char dst[N];
// one extra byte so src[n] can hold the NUL when n == N
char src[N + 1];

printf("\n");
for (int n = 1; n <= N; n *= 2) {
// fill src[0..n-1] with nonzero bytes, then terminate at src[n]
for (int i = 0; i < n; ++i)
src[i] = max(rand() & 255, 1);
src[n] = 0;
// the limit is always N, so the copy length is governed by where the NUL
// sits; when n == N the NUL lies beyond the limit and is never reached
// NOTE(review): 100 and n are presumably the iteration count and the
// work-per-run used to scale the reported time -- confirm against BENCHMARK
BENCHMARK(100, n, X(memccpy(dst, src, 0, V(N))));
}

printf("\n");
// same workload as above, run through memccpy_pure() for comparison
for (int n = 1; n <= N; n *= 2) {
for (int i = 0; i < n; ++i)
src[i] = max(rand() & 255, 1);
src[n] = 0;
BENCHMARK(100, n, X(memccpy_pure(dst, src, 0, V(N))));
}
}

0 comments on commit e4d6eb3

Please sign in to comment.