diff --git a/mak/COPY b/mak/COPY index 4c1719a041..e615dd7606 100644 --- a/mak/COPY +++ b/mak/COPY @@ -20,6 +20,8 @@ COPY=\ $(IMPDIR)\core\thread.d \ $(IMPDIR)\core\time.d \ $(IMPDIR)\core\vararg.d \ + \ + $(IMPDIR)\core\experimental\memutils.d \ \ $(IMPDIR)\core\internal\abort.d \ $(IMPDIR)\core\internal\arrayop.d \ diff --git a/mak/DOCS b/mak/DOCS index fa49be8963..c5ea44bcc2 100644 --- a/mak/DOCS +++ b/mak/DOCS @@ -19,6 +19,8 @@ DOCS=\ $(DOCDIR)\core_gc_config.html \ $(DOCDIR)\core_gc_gcinterface.html \ $(DOCDIR)\core_gc_registry.html \ + \ + $(DOCDIR)\core_experimental_memutils.html \ \ $(DOCDIR)\core_stdc_assert_.html \ $(DOCDIR)\core_stdc_config.html \ diff --git a/mak/SRCS b/mak/SRCS index 309ca0f8d4..9d9d897cb0 100644 --- a/mak/SRCS +++ b/mak/SRCS @@ -16,6 +16,8 @@ SRCS=\ src\core\thread.d \ src\core\time.d \ src\core\vararg.d \ + \ + src\core\experimental\memutils.d \ \ src\core\gc\config.d \ src\core\gc\gcinterface.d \ diff --git a/mak/WINDOWS b/mak/WINDOWS index 8fc6f78e14..2d46889566 100644 --- a/mak/WINDOWS +++ b/mak/WINDOWS @@ -116,6 +116,9 @@ $(IMPDIR)\core\gc\gcinterface.d : src\core\gc\gcinterface.d $(IMPDIR)\core\gc\registry.d : src\core\gc\registry.d copy $** $@ +$(IMPDIR)\core\experimental\memutils.d : src\core\experimental\memutils.d + copy $** $@ + $(IMPDIR)\core\internal\abort.d : src\core\internal\abort.d copy $** $@ diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d new file mode 100644 index 0000000000..9da20e125e --- /dev/null +++ b/src/core/experimental/memutils.d @@ -0,0 +1,685 @@ +/** + * Pure D replacement of the C Standard Library basic memory building blocks of string.h + * Source: $(DRUNTIMESRC core/experimental/memutils.d) + */ +module core.experimental.memutils; + +void memmove(T)(T *dst, const T *src) +{ + void *d = cast(void*) dst; + const(void) *s = cast(void*) src; + if ((cast(ulong)d - cast(ulong)s) < T.sizeof) + { // There is overlap with dest being ahead. Use backwards move. + Dmemmove_back(d, s, T.sizeof); + } + else if ((cast(ulong)s - cast(ulong)d) < T.sizeof) + { // There is overlap with src being ahead. Use backwards move. + Dmemmove_forw(d, s, T.sizeof); + } + else + { // There is no overlap, use memcpy. + Dmemcpy(dst, src); + } +} + +unittest +{ + real a = 1.2; + real b; + memmove(&b, &a); + assert(b == 1.2); + // Overwrite the type system and create overlap with dst forward. + ubyte[8] buf; + int *p = cast(int*) buf.ptr; + int *q = cast(int*) (buf.ptr + 2); + *p = 203847; + memmove(q, p); + assert(*q == 203847); + // Create overlap with src forward. + *q = 92239; + memmove(p, q); + assert(*p == 92239); +} + +/* Dynamic Arrays + */ +void memmove(T)(T[] dst, const T[] src) +{ + mixin(arrayCode); +} + +/* Static Arrays + */ +void memmove(T, size_t len)(ref T[len] dst, ref const T[len] src) +{ + mixin(arrayCode); +} + +enum arrayCode = " + assert(dst.length == src.length); + void *d = cast(void*) dst.ptr; + const void *s = cast(const(void)*) src.ptr; + size_t n = dst.length * typeof(dst[0]).sizeof; + if ((cast(ulong)d - cast(ulong)s) < n) + { // There is overlap with dest being ahead. Use backwards move. + Dmemmove_back(d, s, n); + } + else if ((cast(ulong)s - cast(ulong)d) < n) + { // There is overlap with src being ahead. Use backwards move. + Dmemmove_forw(d, s, n); + } + else + { // There is no overlap, use memcpy. 
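+ // Neither unsigned difference was smaller than n: with wraparound
+ // arithmetic that means the two ranges are disjoint, so a plain
+ // forward copy is safe here.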
+ pragma(inline, true); + Dmemcpy(d, s, n); + }"; + + + +unittest +{ + const float[3] a = [1.2, 3.4, 5.8]; + float[3] b; + memmove(b, a); + assert(b[0] == 1.2f); + assert(b[1] == 3.4f); + assert(b[2] == 5.8f); +} + +/* Can we use SIMD? + */ +version (D_SIMD) +{ + import core.simd: float4; + enum useSIMD = true; +} +else version (LDC) +{ + // LDC always supports SIMD (but doesn't ever set D_SIMD) and + // the back-end uses the most appropriate size for every target. + import core.simd: float4; + enum useSIMD = true; +} +else version (GNU) +{ + import core.simd: float4; + // GNU does not support SIMD by default. + version (X86_64) + { + enum isX86 = true; + } + else version (X86) + { + enum isX86 = true; + } + + static if (isX86 && __traits(compiles, float4)) + { + enum useSIMD = true; + } + else + { + enum useSIMD = false; + } +} +else +{ + enum useSIMD = false; +} + +/* Little SIMD library + */ +static if (useSIMD) +{ + version (LDC) + { + import ldc.simd: loadUnaligned, storeUnaligned; + } + else version (DigitalMars) + { + import core.simd: void16, loadUnaligned, storeUnaligned; + } + else version (GNU) + { + import gcc.builtins : __builtin_ia32_storeups, __builtin_ia32_loadups; + } + + void store16f_sse(void *dest, float4 reg) + { + version (LDC) + { + storeUnaligned!float4(reg, cast(float*)dest); + } + else version (DigitalMars) + { + storeUnaligned(cast(void16*)dest, reg); + } + else version (GNU) + { + __builtin_ia32_storeups(cast(float*) dest, reg); + } + } + float4 load16f_sse(const(void) *src) + { + version (LDC) + { + return loadUnaligned!(float4)(cast(const(float)*) src); + } + else version (DigitalMars) + { + return loadUnaligned(cast(void16*) src); + } else version (GNU) + { + return __builtin_ia32_loadups(cast(float*) src); + } + } + + void prefetchForward(void *s) + { + enum writeFetch = 0; + enum locality = 3; // -> t0 + version (DigitalMars) + { + import core.simd : prefetch; + prefetch!(writeFetch, locality)(s+0x1a0); + prefetch!(writeFetch, locality)(s+0x280); + } + else version (LDC) + { + import ldc.intrinsics : llvm_prefetch; + enum dataCache = 1; + llvm_prefetch(s+0x1a0, writeFetch, locality, dataCache); + llvm_prefetch(s+0x280, writeFetch, locality, dataCache); + } + else version (GNU) + { + import gcc.builtins : __builtin_prefetch; + __builtin_prefetch(s+0x1a0, writeFetch, locality); + __builtin_prefetch(s+0x280, writeFetch, locality); + } + + } + void lstore128fp_sse(void *d, const(void) *s) + { + prefetchForward(cast(void*) s); + lstore128f_sse(d, s); + } + void lstore128f_sse(void *d, const(void) *s) + { + float4 xmm0 = load16f_sse(cast(const float*)s); + float4 xmm1 = load16f_sse(cast(const float*)(s+16)); + float4 xmm2 = load16f_sse(cast(const float*)(s+32)); + float4 xmm3 = load16f_sse(cast(const float*)(s+48)); + float4 xmm4 = load16f_sse(cast(const float*)(s+64)); + float4 xmm5 = load16f_sse(cast(const float*)(s+80)); + float4 xmm6 = load16f_sse(cast(const float*)(s+96)); + float4 xmm7 = load16f_sse(cast(const float*)(s+112)); + // + store16f_sse(cast(float*)d, xmm0); + store16f_sse(cast(float*)(d+16), xmm1); + store16f_sse(cast(float*)(d+32), xmm2); + store16f_sse(cast(float*)(d+48), xmm3); + store16f_sse(cast(float*)(d+64), xmm4); + store16f_sse(cast(float*)(d+80), xmm5); + store16f_sse(cast(float*)(d+96), xmm6); + store16f_sse(cast(float*)(d+112), xmm7); + } + void lstore64f_sse(void *d, const(void) *s) + { + float4 xmm0 = load16f_sse(cast(const float*)s); + float4 xmm1 = load16f_sse(cast(const float*)(s+16)); + float4 xmm2 = load16f_sse(cast(const 
float*)(s+32)); + float4 xmm3 = load16f_sse(cast(const float*)(s+48)); + // + store16f_sse(cast(float*)d, xmm0); + store16f_sse(cast(float*)(d+16), xmm1); + store16f_sse(cast(float*)(d+32), xmm2); + store16f_sse(cast(float*)(d+48), xmm3); + } + void lstore32f_sse(void *d, const(void) *s) + { + float4 xmm0 = load16f_sse(cast(const float*)s); + float4 xmm1 = load16f_sse(cast(const float*)(s+16)); + // + store16f_sse(cast(float*)d, xmm0); + store16f_sse(cast(float*)(d+16), xmm1); + } +} + +/* + * + * + * memcpy() implementation + * + * + */ + +/* + * Static implementation + * + */ + +/* Handle static types. + */ +// NOTE(stefanos): Previously, there was more sophisticated code +// for static types. But the rationale of removing it is that +// the compiler knows better how to optimize static types. +pragma(inline, true) +void Dmemcpy(T)(T *dst, const T *src) +{ + *dst = *src; +} + +/* + * Dynamic implementation + * NOTE: Dmemcpy requires _no_ overlap + * + */ +static if (useSIMD) +{ + + +/* Handle dynamic sizes. `d` and `s` must not overlap. + */ +void Dmemcpy(void *d, const(void) *s, size_t n) +{ + if (n <= 128) + { + Dmemcpy_small(d, s, n); + } + else + { + Dmemcpy_large(d, s, n); + } +} + +/* Handle dynamic sizes <= 128. `d` and `s` must not overlap. + */ +void Dmemcpy_small(void *d, const(void) *s, size_t n) +{ + if (n < 16) { + if (n & 0x01) + { + *cast(ubyte*)d = *cast(const ubyte*)s; + ++d; + ++s; + } + if (n & 0x02) + { + *cast(ushort*)d = *cast(const ushort*)s; + d += 2; + s += 2; + } + if (n & 0x04) + { + *cast(uint*)d = *cast(const uint*)s; + d += 4; + s += 4; + } + if (n & 0x08) + { + *cast(ulong*)d = *cast(const ulong*)s; } + return; + } + if (n <= 32) + { + float4 xmm0 = load16f_sse(s); + float4 xmm1 = load16f_sse(s-16+n); + store16f_sse(d, xmm0); + store16f_sse(d-16+n, xmm1); + return; + } + // NOTE(stefanos): I'm writing using load/storeUnaligned() but you possibly can + // achieve greater performance using naked ASM. Be careful that you should either use + // only D or only naked ASM. + if (n <= 64) + { + float4 xmm0 = load16f_sse(s); + float4 xmm1 = load16f_sse(s+16); + float4 xmm2 = load16f_sse(s-32+n); + float4 xmm3 = load16f_sse(s-32+n+16); + store16f_sse(d, xmm0); + store16f_sse(d+16, xmm1); + store16f_sse(d-32+n, xmm2); + store16f_sse(d-32+n+16, xmm3); + return; + } + import core.simd: void16; + lstore64f_sse(d, s); + // NOTE(stefanos): Requires _no_ overlap. + n -= 64; + s = s + n; + d = d + n; + lstore64f_sse(d, s); +} + + +/* Handle dynamic sizes > 128. `d` and `s` must not overlap. + */ +// TODO(stefanos): I tried prefetching. I suppose +// because this is a forward implementation, it should +// actuall reduce performance, but a better check would be good. +// TODO(stefanos): Consider aligning from the end, negate `n` and adding +// every time the `n` (and thus going backwards). That reduces the operations +// inside the loop. +// TODO(stefanos): Consider aligning `n` to 32. This will reduce one operation +// inside the loop but only if the compiler can pick it up (in my tests, it didn't). +// TODO(stefanos): Do a better research on how to inform the compiler about alignment, +// something like assume_aligned. +// NOTE(stefanos): This function requires _no_ overlap. +void Dmemcpy_large(void *d, const(void) *s, size_t n) +{ + // NOTE(stefanos): Alternative - Reach 64-byte + // (cache-line) alignment and use rep movsb + // Good for bigger sizes and only for Intel. 
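+ //
+ // Strategy used below: peel one unaligned 16-byte store to align the
+ // destination, stream the bulk in 128-byte blocks (adding software
+ // prefetch once the copy is large enough for it to pay off), and finish
+ // with one last 128-byte block anchored at the very end of the buffer.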
+ + // Align destination (write) to 32-byte boundary + // NOTE(stefanos): We're using SSE, which needs 16-byte alignment. + // But actually, 32-byte alignment was quite faster (probably because + // the loads / stores are faster and there's the bottleneck). + uint rem = cast(ulong)d & 15; + if (rem) + { + store16f_sse(d, load16f_sse(s)); + s += 16 - rem; + d += 16 - rem; + n -= 16 - rem; + } + + static string loop(string prefetchChoice)() + { + return + " + while (n >= 128) + { + // Aligned stores / writes + " ~ prefetchChoice ~ "(d, s); + d += 128; + s += 128; + n -= 128; + } + "; + } + + if (n >= 20000) + { + mixin(loop!("lstore128fp_sse")()); + } + else + { + mixin(loop!("lstore128f_sse")()); + } + + // NOTE(stefanos): We already have checked that the initial size is >= 128 + // to be here. So, we won't overwrite previous data. + if (n != 0) + { + lstore128f_sse(d - 128 + n, s - 128 + n); + } +} + +} +else +{ + /* Non-SIMD version + */ + // TODO(stefanos): GNU algorithm. + void Dmemcpy(void *d, const(void) *s, size_t n) + { + ubyte *dst = cast(ubyte*) d; + const(ubyte) *src = cast(const(ubyte)*) s; + for (size_t i = 0; i != n; ++i) + { + *dst = *src; + dst++; + src++; + } + } +} + + + +/* + * + * + * memmove() implementation + * + * + */ + +static if (useSIMD) +{ + + +/* Handle dynamic sizes < 64 with backwards move. Overlap is possible. + */ +void Dmemmove_back_lt64(void *d, const(void) *s, size_t n) +{ + if (n & 32) + { + n -= 32; + // IMPORTANT(stefanos): Don't call _store* functions as they copy forward. + // First load both values, _then_ store. + float4 xmm0 = load16f_sse(s+n+16); + float4 xmm1 = load16f_sse(s+n); + store16f_sse(d+n+16, xmm0); + store16f_sse(d+n, xmm1); + } + if (n & 16) + { + n -= 16; + float4 xmm0 = load16f_sse(s+n); + store16f_sse(d+n, xmm0); + } + if (n & 8) + { + n -= 8; + *(cast(ulong*)(d+n)) = *(cast(const ulong*)(s+n)); + } + if (n & 4) + { + n -= 4; + *(cast(uint*)(d+n)) = *(cast(const uint*)(s+n)); + } + if (n & 2) + { + n -= 2; + *(cast(ushort*)(d+n)) = *(cast(const ushort*)(s+n)); + } + if (n & 1) + { + *(cast(ubyte*)d) = *(cast(const ubyte*)s); + } +} + + +/* Handle dynamic sizes with backwards move. Overlap is possible. + */ +void Dmemmove_back(void *d, const(void) *s, size_t n) +{ +START: + if (n < 64) + { + Dmemmove_back_lt64(d, s, n); + return; + } + s += n; + d += n; + if (n < 128) + { + float4 xmm0 = load16f_sse(s-0x10); + float4 xmm1 = load16f_sse(s-0x20); + float4 xmm2 = load16f_sse(s-0x30); + float4 xmm3 = load16f_sse(s-0x40); + store16f_sse(d-0x10, xmm0); + store16f_sse(d-0x20, xmm1); + store16f_sse(d-0x30, xmm2); + store16f_sse(d-0x40, xmm3); + // NOTE(stefanos): We can't do the standard trick where we just go back enough bytes + // so that we can move the last bytes with a 64-byte move even if they're less than 64. + // To do that, we have to _not_ have overlap. + s = s - n; + d = d - n; + n -= 64; + Dmemmove_back_lt64(d, s, n); + return; + } + uint rem = cast(ulong)d & 31; + if (rem) + { + // NOTE(stefanos): Again, can't use the standard trick because of overlap. + Dmemmove_back_lt64(d-rem, s-rem, rem); + s -= rem; + d -= rem; + n -= rem; + } + while (n >= 128) + { + // NOTE(stefanos): No problem with the overlap here since + // we never use overlapped bytes. But, we should still copy backwards. + // NOTE(stefanos): Prefetching had ambiguous and not clear win. 
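+ // Each iteration moves one 128-byte block as eight 16-byte SSE
+ // load/store pairs, walking from the highest addresses downwards so no
+ // source byte is overwritten before it has been read.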
+ store16f_sse(d-0x10, load16f_sse(s-0x10)); + store16f_sse(d-0x20, load16f_sse(s-0x20)); + store16f_sse(d-0x30, load16f_sse(s-0x30)); + store16f_sse(d-0x40, load16f_sse(s-0x40)); + store16f_sse(d-0x50, load16f_sse(s-0x50)); + store16f_sse(d-0x60, load16f_sse(s-0x60)); + store16f_sse(d-0x70, load16f_sse(s-0x70)); + store16f_sse(d-0x80, load16f_sse(s-0x80)); + s -= 128; + d -= 128; + n -= 128; + } + + if (n) + { + // NOTE(stefanos): Again, can't use the standard trick because of overlap. + // Move pointers to their start. + s -= n; + d -= n; + goto START; + } +} + +/* Handle dynamic sizes < 64 with forwards move. Overlap is possible. + */ +void Dmemmove_forw_lt64(void *d, const(void) *s, size_t n) +{ + if (n & 32) + { + lstore32f_sse(d, s); + n -= 32; + s += 32; + d += 32; + } + if (n & 16) + { + store16f_sse(d, load16f_sse(s)); + n -= 16; + s += 16; + d += 16; + } + if (n & 8) + { + *(cast(ulong*)(d)) = *(cast(const ulong*)(s)); + n -= 8; + s += 8; + d += 8; + } + if (n & 4) + { + n -= 4; + *(cast(uint*)(d)) = *(cast(const uint*)(s)); + n -= 4; + s += 4; + d += 4; + } + if (n & 2) + { + n -= 2; + *(cast(ushort*)(d)) = *(cast(const ushort*)(s)); + n -= 2; + s += 2; + d += 2; + } + if (n & 1) + { + *(cast(ubyte*)d) = *(cast(const ubyte*)s); + } +} + +/* Handle dynamic sizes with forwards move. Overlap is possible. + */ +void Dmemmove_forw(void *d, const(void) *s, size_t n) +{ +START: + if (n < 64) + { + Dmemmove_forw_lt64(d, s, n); + return; + } + if (n < 128) + { + // We know it's >= 64, so move the first 64 bytes freely. + lstore64f_sse(d, s); + // NOTE(stefanos): We can't do the standard trick where we just go forward enough bytes + // so that we can move the last bytes with a 64-byte move even if they're less than 64. + // To do that, we have to _not_ have overlap. + s += 64; + d += 64; + n -= 64; + Dmemmove_forw_lt64(d, s, n); + return; + } + uint rem = cast(ulong)d & 31; + if (rem) + { + // NOTE(stefanos): Again, can't use the standard trick because of overlap. + Dmemmove_forw_lt64(d, s, 32-rem); + s += 32 - rem; + d += 32 - rem; + n -= 32 - rem; + } + + while (n >= 128) + { + // NOTE(stefanos): No problem with the overlap here since + // we never use overlapped bytes. + // NOTE(stefanos): Prefetching had a relatively insignificant + // win for about > 30000. + lstore128f_sse(d, s); + s += 128; + d += 128; + n -= 128; + } + + if (n) + { + // NOTE(stefanos): Again, can't use the standard trick because of overlap. + goto START; + } +} + +} +else +{ + void Dmemmove_forw(void *d, const(void) *s, size_t n) + { + ubyte *dst = cast(ubyte*) d; + const(ubyte) *src = cast(const(ubyte)*) s; + foreach (i; 0 .. n) + { + *(dst+i) = *(src+i); + } + } + + void Dmemmove_back(void *d, const(void) *s, size_t n) + { + ubyte *dst = cast(ubyte*) d; + const(ubyte) *src = cast(const(ubyte)*) s; + foreach_reverse (i; 0 .. n) + { + *(dst+i) = *(src+i); + } + } +}
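+
+ // A minimal usage sketch of the overloads above: the pointer overload on a
+ // scalar, and the dynamic-array overload on two genuinely overlapping
+ // slices (the destination starts inside the source).
+ unittest
+ {
+     // Non-overlapping, fixed-size move through the pointer overload.
+     int a = 42;
+     int b;
+     memmove(&b, &a);
+     assert(b == 42);
+
+     // Overlapping dynamic-array move: shift the first 12 bytes up by 4.
+     ubyte[16] buf;
+     foreach (i, ref x; buf)
+         x = cast(ubyte) i;
+     memmove(buf[4 .. 16], buf[0 .. 12]);
+     foreach (i; 0 .. 12)
+         assert(buf[i + 4] == i);
+     assert(buf[0] == 0 && buf[3] == 3); // bytes below the destination are untouched
+ }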