From 847140e0f8e7a150a5178bb8fd5ba443f025aa7e Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Thu, 4 Jul 2019 22:40:05 +0300 Subject: [PATCH 01/29] memutils: Replacement of libc string.h functions - currently only Dmemset() --- mak/COPY | 2 + mak/DOCS | 2 + mak/SRCS | 2 + mak/WINDOWS | 3 + src/core/experimental/memutils.d | 300 +++++++++++++++++++++++++++++++ 5 files changed, 309 insertions(+) create mode 100644 src/core/experimental/memutils.d diff --git a/mak/COPY b/mak/COPY index 4c1719a041..6086012578 100644 --- a/mak/COPY +++ b/mak/COPY @@ -21,6 +21,8 @@ COPY=\ $(IMPDIR)\core\time.d \ $(IMPDIR)\core\vararg.d \ \ + $(IMPDIR)\core\experimental\memutils.d \ + \ $(IMPDIR)\core\internal\abort.d \ $(IMPDIR)\core\internal\arrayop.d \ $(IMPDIR)\core\internal\convert.d \ diff --git a/mak/DOCS b/mak/DOCS index fa49be8963..c5ea44bcc2 100644 --- a/mak/DOCS +++ b/mak/DOCS @@ -19,6 +19,8 @@ DOCS=\ $(DOCDIR)\core_gc_config.html \ $(DOCDIR)\core_gc_gcinterface.html \ $(DOCDIR)\core_gc_registry.html \ + \ + $(DOCDIR)\core_experimental_memutils.html \ \ $(DOCDIR)\core_stdc_assert_.html \ $(DOCDIR)\core_stdc_config.html \ diff --git a/mak/SRCS b/mak/SRCS index 309ca0f8d4..9d9d897cb0 100644 --- a/mak/SRCS +++ b/mak/SRCS @@ -16,6 +16,8 @@ SRCS=\ src\core\thread.d \ src\core\time.d \ src\core\vararg.d \ + \ + src\core\experimental\memutils.d \ \ src\core\gc\config.d \ src\core\gc\gcinterface.d \ diff --git a/mak/WINDOWS b/mak/WINDOWS index 8fc6f78e14..2d46889566 100644 --- a/mak/WINDOWS +++ b/mak/WINDOWS @@ -116,6 +116,9 @@ $(IMPDIR)\core\gc\gcinterface.d : src\core\gc\gcinterface.d $(IMPDIR)\core\gc\registry.d : src\core\gc\registry.d copy $** $@ +$(IMPDIR)\core\experimental\memutils.d : src\core\experimental\memutils.d + copy $** $@ + $(IMPDIR)\core\internal\abort.d : src\core\internal\abort.d copy $** $@ diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d new file mode 100644 index 0000000000..df41547aeb --- /dev/null +++ 
b/src/core/experimental/memutils.d @@ -0,0 +1,300 @@ +/** + * Pure D replacement of the C Standard Library basic memory building blocks of string.h + * + * Source: $(DRUNTIMESRC core/experimental/memutils.d) + */ + +module core.experimental.memutils; + +unittest +{ + Dmemset_testStaticType!(byte)(5); + Dmemset_testStaticType!(ubyte)(5); + Dmemset_testStaticType!(short)(5); + Dmemset_testStaticType!(ushort)(5); + Dmemset_testStaticType!(int)(5); + Dmemset_testStaticType!(uint)(5); + Dmemset_testStaticType!(long)(5); + Dmemset_testStaticType!(ulong)(5); + Dmemset_testStaticType!(float)(5); + Dmemset_testStaticType!(double)(5); + Dmemset_testStaticType!(real)(5); + Dmemset_testDynamicArray!(ubyte)(5, 3); + static foreach(i; 1..10) { + Dmemset_testDynamicArray!(ubyte)(5, 2^^i); + Dmemset_testStaticArray!(ubyte, 2^^i)(5); + } + Dmemset_testDynamicArray!(ubyte)(5, 100); + Dmemset_testStaticArray!(ubyte, 100)(5); + Dmemset_testDynamicArray!(ubyte)(5, 500); + Dmemset_testStaticArray!(ubyte, 500)(5); + Dmemset_testDynamicArray!(ubyte)(5, 700); + Dmemset_testStaticArray!(ubyte, 700)(5); + Dmemset_testDynamicArray!(ubyte)(5, 3434); + Dmemset_testStaticArray!(ubyte, 3434)(5); + Dmemset_testDynamicArray!(ubyte)(5, 7128); + Dmemset_testStaticArray!(ubyte, 7128)(5); + Dmemset_testDynamicArray!(ubyte)(5, 13908); + Dmemset_testStaticArray!(ubyte, 13908)(5); + Dmemset_testDynamicArray!(ubyte)(5, 16343); + Dmemset_testStaticArray!(ubyte, 16343)(5); + Dmemset_testDynamicArray!(ubyte)(5, 27897); + Dmemset_testStaticArray!(ubyte, 27897)(5); + Dmemset_testDynamicArray!(ubyte)(5, 32344); + Dmemset_testStaticArray!(ubyte, 32344)(5); + Dmemset_testDynamicArray!(ubyte)(5, 46830); + Dmemset_testStaticArray!(ubyte, 46830)(5); + Dmemset_testDynamicArray!(ubyte)(5, 64349); + Dmemset_testStaticArray!(ubyte, 64349)(5); +} + +// From a very good Chandler Carruth video on benchmarking: https://www.youtube.com/watch?v=nXaxk27zwlk +void escape(void* p) +{ + version(LDC) + { + import ldc.llvmasm; + 
__asm("", "r,~{memory}", p); + } + version(GNU) + { + asm { "" : : "g" p : "memory"; } + } +} + +void Dmemset_verifyArray(T)(int j, const ref T[] a, const ubyte v) +{ + const ubyte *p = cast(const ubyte *) a.ptr; + for(size_t i = 0; i < a.length * T.sizeof; i++) + { + assert(p[i] == v); + } +} + +void Dmemset_verifyStaticType(T)(const ref T t, const ubyte v) +{ + const ubyte *p = cast(const ubyte *) &t; + for(size_t i = 0; i < T.sizeof; i++) + { + assert(p[i] == v); + } +} + +void Dmemset_testDynamicArray(T)(const ubyte v, size_t n) +{ + T[] buf; + buf.length = n + 32; + + enum alignments = 32; + size_t len = n; + + foreach(i; 0..alignments) + { + auto d = buf[i..i+n]; + + escape(d.ptr); + Dmemset(d, v); + Dmemset_verifyArray(i, d, v); + } +} + +void Dmemset_testStaticArray(T, size_t n)(const ubyte v) +{ + T[n + 32] buf; + + enum alignments = 32; + size_t len = n; + + foreach(i; 0..alignments) + { + auto d = buf[i..i+n]; + + escape(d.ptr); + Dmemset(d, v); + Dmemset_verifyArray(i, d, v); + } +} + +void Dmemset_testStaticType(T)(const ubyte v) +{ + T t; + escape(&t); + Dmemset(t, v); + Dmemset_verifyStaticType(t, v); +} + +version (GNU) +{ + void Dmemset(void *d, const uint val, size_t n) + { + Dmemset_naive(d, cast(const(ubyte))val, n); + } +} +else +{ + // NOTE(stefanos): I could not a GDC respective of the intrinsics. + void Dmemset(void *d, const uint val, size_t n) + { + import core.simd: int4; + version (LDC) + { + import ldc.simd: loadUnaligned, storeUnaligned; + } + else + version (DigitalMars) + { + import core.simd: void16, loadUnaligned, storeUnaligned; + } + else + { + static assert(0, "Only DMD / LDC are supported"); + } + + // TODO(stefanos): Is there a way to make them @safe? + // (The problem is that for LDC, they could take int* or float* pointers + // but the cast to void16 for DMD is necessary anyway). 
+ + /// Integer /// + + void store32i_sse(void *dest, int4 reg) + { + version (LDC) + { + storeUnaligned!int4(reg, cast(int*)dest); + storeUnaligned!int4(reg, cast(int*)(dest+0x10)); + } + else + { + storeUnaligned(cast(void16*)dest, reg); + storeUnaligned(cast(void16*)(dest+0x10), reg); + } + } + + void store16i_sse(void *dest, int4 reg) + { + version (LDC) + { + storeUnaligned!int4(reg, cast(int*)dest); + } + else + { + storeUnaligned(cast(void16*)dest, reg); + } + } + + // TODO(stefanos): Can we broadcast an int in a float4? That would be useful + // because then we would use only the float versions. + void broadcast_int(ref int4 xmm, int v) + { + xmm[0] = v; + xmm[1] = v; + xmm[2] = v; + xmm[3] = v; + } + const uint v = val * 0x01010101; // Broadcast c to all 4 bytes + + // NOTE(stefanos): I use the naive version, which in my benchmarks was slower + // than the previous classic switch. BUT. Using the switch had a significant + // drop in the rest of the sizes. It's not the branch that is responsible for the drop, + // but the fact that it's more difficult to optimize it as part of the rest of the code. + if (n <= 16) + { + Dmemset_naive(cast(ubyte*)d, cast(ubyte)val, n); + return; + } + void *temp = d + n - 0x10; // Used for the last 32 bytes + + int4 xmm0; + // Broadcast v to all bytes. + broadcast_int(xmm0, v); + + ubyte rem = cast(ulong)d & 15; // Remainder from the previous 16-byte boundary. + // Store 16 bytes, from which some will possibly overlap on a future store. + // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned, + // add 16 - 7 = 9 to `d` and start storing aligned. Since 16 - `rem` can be at most + // 16, we store 16 bytes anyway. + store16i_sse(d, xmm0); + d += 16 - rem; + n -= 16 - rem; + + // Move in blocks of 32. + // TODO(stefanos): Experiment with differnt sizes. + if (n >= 32) + { + // Align to (previous) multiple of 32. 
That does something invisible to the code, + // but a good optimizer will avoid a `cmp` instruction inside the loop. With a + // multiple of 32, the end of the loop can be (if we assume that `n` is in RDX): + // sub RDX, 32; + // jge START_OF_THE_LOOP. + // Without that, it has to be: + // sub RDX, 32; + // cmp RDX, 32; + // jge START_OF_THE_LOOP + // NOTE, that we align on a _previous_ multiple (for 37, we will go to 32). That means + // we have somehow to compensate for that, which is done at the end of this function. + n &= -32; + do + { + store32i_sse(d, xmm0); + // NOTE(stefanos): I tried avoiding this operation on `d` by combining + // `d` and `n` in the above loop and going backwards. It was slower in my benchs. + d += 32; + n -= 32; + } while(n >= 32); + } + // Compensate for the last (at most) 32 bytes. + store32i_sse(temp-0x10, xmm0); + } +} + +void Dmemset_naive(void *dst, const ubyte val, size_t n) +{ + ubyte *d = cast(ubyte*)dst; + for (size_t i = 0; i != n; ++i) + { + d[i] = val; + } +} + +// NOTE(stefanos): +// Range-checking is not needed since the user never +// pass an `n` (byte count) directly. 
+ +void Dmemset(T)(ref T dst, const ubyte val) +{ + import std.traits; + const uint v = cast(uint)val; + version (X86_64) + { + static if (isArray!T) + { + size_t n = dst.length * typeof(dst[0]).sizeof; + Dmemset(dst.ptr, v, n); + + version (unittest) + { + Dmemset_naive(dst.ptr, v, n); + } + } + else + { + Dmemset(&dst, v, T.sizeof); + + version (unittest) + { + Dmemset_naive(&dst, v, T.sizeof); + } + } + } + else + { + static if (isArray!T) + { + Dmemset_naive(dst.ptr, val, dst.length * typeof(dst[0]).sizeof); + } + else + { + Dmemset_naive(&dst, val, T.sizeof); + } + } +} From f991173bef05c7fdd6629798f46ab933eaac0a8d Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Thu, 4 Jul 2019 22:57:54 +0300 Subject: [PATCH 02/29] Style fix --- src/core/experimental/memutils.d | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index df41547aeb..758ebc25c4 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -1,9 +1,7 @@ /** * Pure D replacement of the C Standard Library basic memory building blocks of string.h - * * Source: $(DRUNTIMESRC core/experimental/memutils.d) */ - module core.experimental.memutils; unittest @@ -20,7 +18,7 @@ unittest Dmemset_testStaticType!(double)(5); Dmemset_testStaticType!(real)(5); Dmemset_testDynamicArray!(ubyte)(5, 3); - static foreach(i; 1..10) { + static foreach (i; 1..10) { Dmemset_testDynamicArray!(ubyte)(5, 2^^i); Dmemset_testStaticArray!(ubyte, 2^^i)(5); } @@ -51,12 +49,12 @@ unittest // From a very good Chandler Carruth video on benchmarking: https://www.youtube.com/watch?v=nXaxk27zwlk void escape(void* p) { - version(LDC) + version (LDC) { import ldc.llvmasm; __asm("", "r,~{memory}", p); } - version(GNU) + version (GNU) { asm { "" : : "g" p : "memory"; } } @@ -65,7 +63,7 @@ void escape(void* p) void Dmemset_verifyArray(T)(int j, const ref T[] a, const ubyte v) { const ubyte *p = 
cast(const ubyte *) a.ptr; - for(size_t i = 0; i < a.length * T.sizeof; i++) + for (size_t i = 0; i < a.length * T.sizeof; i++) { assert(p[i] == v); } @@ -74,7 +72,7 @@ void Dmemset_verifyArray(T)(int j, const ref T[] a, const ubyte v) void Dmemset_verifyStaticType(T)(const ref T t, const ubyte v) { const ubyte *p = cast(const ubyte *) &t; - for(size_t i = 0; i < T.sizeof; i++) + for (size_t i = 0; i < T.sizeof; i++) { assert(p[i] == v); } @@ -88,7 +86,7 @@ void Dmemset_testDynamicArray(T)(const ubyte v, size_t n) enum alignments = 32; size_t len = n; - foreach(i; 0..alignments) + foreach (i; 0..alignments) { auto d = buf[i..i+n]; @@ -105,7 +103,7 @@ void Dmemset_testStaticArray(T, size_t n)(const ubyte v) enum alignments = 32; size_t len = n; - foreach(i; 0..alignments) + foreach (i; 0..alignments) { auto d = buf[i..i+n]; @@ -149,13 +147,9 @@ else { static assert(0, "Only DMD / LDC are supported"); } - // TODO(stefanos): Is there a way to make them @safe? // (The problem is that for LDC, they could take int* or float* pointers // but the cast to void16 for DMD is necessary anyway). - - /// Integer /// - void store32i_sse(void *dest, int4 reg) { version (LDC) @@ -169,7 +163,6 @@ else storeUnaligned(cast(void16*)(dest+0x10), reg); } } - void store16i_sse(void *dest, int4 reg) { version (LDC) @@ -181,7 +174,6 @@ else storeUnaligned(cast(void16*)dest, reg); } } - // TODO(stefanos): Can we broadcast an int in a float4? That would be useful // because then we would use only the float versions. void broadcast_int(ref int4 xmm, int v) @@ -192,7 +184,6 @@ else xmm[3] = v; } const uint v = val * 0x01010101; // Broadcast c to all 4 bytes - // NOTE(stefanos): I use the naive version, which in my benchmarks was slower // than the previous classic switch. BUT. Using the switch had a significant // drop in the rest of the sizes. 
It's not the branch that is responsible for the drop, @@ -203,11 +194,9 @@ else return; } void *temp = d + n - 0x10; // Used for the last 32 bytes - int4 xmm0; // Broadcast v to all bytes. broadcast_int(xmm0, v); - ubyte rem = cast(ulong)d & 15; // Remainder from the previous 16-byte boundary. // Store 16 bytes, from which some will possibly overlap on a future store. // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned, @@ -216,7 +205,6 @@ else store16i_sse(d, xmm0); d += 16 - rem; n -= 16 - rem; - // Move in blocks of 32. // TODO(stefanos): Experiment with differnt sizes. if (n >= 32) @@ -240,7 +228,7 @@ else // `d` and `n` in the above loop and going backwards. It was slower in my benchs. d += 32; n -= 32; - } while(n >= 32); + } while (n >= 32); } // Compensate for the last (at most) 32 bytes. store32i_sse(temp-0x10, xmm0); From ea2ce59af42f31570d958c5beb33576c9f39b5cd Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Thu, 4 Jul 2019 23:06:49 +0300 Subject: [PATCH 03/29] Versioning fix --- src/core/experimental/memutils.d | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 758ebc25c4..31259ad07c 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -252,7 +252,7 @@ void Dmemset(T)(ref T dst, const ubyte val) { import std.traits; const uint v = cast(uint)val; - version (X86_64) + version (D_SIMD) { static if (isArray!T) { From bac120f468f201103e51b89346515f59fbbe114f Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Thu, 4 Jul 2019 23:17:28 +0300 Subject: [PATCH 04/29] Versioning fix vol. 
2 --- src/core/experimental/memutils.d | 179 ++++++++++++++++--------------- 1 file changed, 91 insertions(+), 88 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 31259ad07c..4a5fda6fba 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -130,108 +130,111 @@ version (GNU) } else { - // NOTE(stefanos): I could not a GDC respective of the intrinsics. - void Dmemset(void *d, const uint val, size_t n) + version (D_SIMD) { - import core.simd: int4; - version (LDC) - { - import ldc.simd: loadUnaligned, storeUnaligned; - } - else - version (DigitalMars) - { - import core.simd: void16, loadUnaligned, storeUnaligned; - } - else - { - static assert(0, "Only DMD / LDC are supported"); - } - // TODO(stefanos): Is there a way to make them @safe? - // (The problem is that for LDC, they could take int* or float* pointers - // but the cast to void16 for DMD is necessary anyway). - void store32i_sse(void *dest, int4 reg) + // NOTE(stefanos): I could not GDC respective intrinsics. + void Dmemset(void *d, const uint val, size_t n) { + import core.simd: int4; version (LDC) { - storeUnaligned!int4(reg, cast(int*)dest); - storeUnaligned!int4(reg, cast(int*)(dest+0x10)); + import ldc.simd: loadUnaligned, storeUnaligned; } else + version (DigitalMars) { - storeUnaligned(cast(void16*)dest, reg); - storeUnaligned(cast(void16*)(dest+0x10), reg); + import core.simd: void16, loadUnaligned, storeUnaligned; } - } - void store16i_sse(void *dest, int4 reg) - { - version (LDC) + else { - storeUnaligned!int4(reg, cast(int*)dest); + static assert(0, "Only DMD / LDC are supported"); } - else + // TODO(stefanos): Is there a way to make them @safe? + // (The problem is that for LDC, they could take int* or float* pointers + // but the cast to void16 for DMD is necessary anyway). 
+ void store32i_sse(void *dest, int4 reg) { - storeUnaligned(cast(void16*)dest, reg); + version (LDC) + { + storeUnaligned!int4(reg, cast(int*)dest); + storeUnaligned!int4(reg, cast(int*)(dest+0x10)); + } + else + { + storeUnaligned(cast(void16*)dest, reg); + storeUnaligned(cast(void16*)(dest+0x10), reg); + } } - } - // TODO(stefanos): Can we broadcast an int in a float4? That would be useful - // because then we would use only the float versions. - void broadcast_int(ref int4 xmm, int v) - { - xmm[0] = v; - xmm[1] = v; - xmm[2] = v; - xmm[3] = v; - } - const uint v = val * 0x01010101; // Broadcast c to all 4 bytes - // NOTE(stefanos): I use the naive version, which in my benchmarks was slower - // than the previous classic switch. BUT. Using the switch had a significant - // drop in the rest of the sizes. It's not the branch that is responsible for the drop, - // but the fact that it's more difficult to optimize it as part of the rest of the code. - if (n <= 16) - { - Dmemset_naive(cast(ubyte*)d, cast(ubyte)val, n); - return; - } - void *temp = d + n - 0x10; // Used for the last 32 bytes - int4 xmm0; - // Broadcast v to all bytes. - broadcast_int(xmm0, v); - ubyte rem = cast(ulong)d & 15; // Remainder from the previous 16-byte boundary. - // Store 16 bytes, from which some will possibly overlap on a future store. - // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned, - // add 16 - 7 = 9 to `d` and start storing aligned. Since 16 - `rem` can be at most - // 16, we store 16 bytes anyway. - store16i_sse(d, xmm0); - d += 16 - rem; - n -= 16 - rem; - // Move in blocks of 32. - // TODO(stefanos): Experiment with differnt sizes. - if (n >= 32) - { - // Align to (previous) multiple of 32. That does something invisible to the code, - // but a good optimizer will avoid a `cmp` instruction inside the loop. With a - // multiple of 32, the end of the loop can be (if we assume that `n` is in RDX): - // sub RDX, 32; - // jge START_OF_THE_LOOP. 
- // Without that, it has to be: - // sub RDX, 32; - // cmp RDX, 32; - // jge START_OF_THE_LOOP - // NOTE, that we align on a _previous_ multiple (for 37, we will go to 32). That means - // we have somehow to compensate for that, which is done at the end of this function. - n &= -32; - do + void store16i_sse(void *dest, int4 reg) { - store32i_sse(d, xmm0); - // NOTE(stefanos): I tried avoiding this operation on `d` by combining - // `d` and `n` in the above loop and going backwards. It was slower in my benchs. - d += 32; - n -= 32; - } while (n >= 32); + version (LDC) + { + storeUnaligned!int4(reg, cast(int*)dest); + } + else + { + storeUnaligned(cast(void16*)dest, reg); + } + } + // TODO(stefanos): Can we broadcast an int in a float4? That would be useful + // because then we would use only the float versions. + void broadcast_int(ref int4 xmm, int v) + { + xmm[0] = v; + xmm[1] = v; + xmm[2] = v; + xmm[3] = v; + } + const uint v = val * 0x01010101; // Broadcast c to all 4 bytes + // NOTE(stefanos): I use the naive version, which in my benchmarks was slower + // than the previous classic switch. BUT. Using the switch had a significant + // drop in the rest of the sizes. It's not the branch that is responsible for the drop, + // but the fact that it's more difficult to optimize it as part of the rest of the code. + if (n <= 16) + { + Dmemset_naive(cast(ubyte*)d, cast(ubyte)val, n); + return; + } + void *temp = d + n - 0x10; // Used for the last 32 bytes + int4 xmm0; + // Broadcast v to all bytes. + broadcast_int(xmm0, v); + ubyte rem = cast(ulong)d & 15; // Remainder from the previous 16-byte boundary. + // Store 16 bytes, from which some will possibly overlap on a future store. + // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned, + // add 16 - 7 = 9 to `d` and start storing aligned. Since 16 - `rem` can be at most + // 16, we store 16 bytes anyway. + store16i_sse(d, xmm0); + d += 16 - rem; + n -= 16 - rem; + // Move in blocks of 32. 
+ // TODO(stefanos): Experiment with differnt sizes. + if (n >= 32) + { + // Align to (previous) multiple of 32. That does something invisible to the code, + // but a good optimizer will avoid a `cmp` instruction inside the loop. With a + // multiple of 32, the end of the loop can be (if we assume that `n` is in RDX): + // sub RDX, 32; + // jge START_OF_THE_LOOP. + // Without that, it has to be: + // sub RDX, 32; + // cmp RDX, 32; + // jge START_OF_THE_LOOP + // NOTE, that we align on a _previous_ multiple (for 37, we will go to 32). That means + // we have somehow to compensate for that, which is done at the end of this function. + n &= -32; + do + { + store32i_sse(d, xmm0); + // NOTE(stefanos): I tried avoiding this operation on `d` by combining + // `d` and `n` in the above loop and going backwards. It was slower in my benchs. + d += 32; + n -= 32; + } while (n >= 32); + } + // Compensate for the last (at most) 32 bytes. + store32i_sse(temp-0x10, xmm0); } - // Compensate for the last (at most) 32 bytes. - store32i_sse(temp-0x10, xmm0); } } From ff7e755c54d1d36901351463e45f542c90e0c873 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Thu, 4 Jul 2019 23:42:50 +0300 Subject: [PATCH 05/29] Independency of std.traits --- src/core/experimental/memutils.d | 63 +++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 4a5fda6fba..3b47bcfdf3 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -251,9 +251,70 @@ void Dmemset_naive(void *dst, const ubyte val, size_t n) // Range-checking is not needed since the user never // pass an `n` (byte count) directly. 
+// Copied from std.traits +import core.internal.traits: Unqual; + +package template ModifyTypePreservingTQ(alias Modifier, T) +{ + static if (is(T U == immutable U)) alias ModifyTypePreservingTQ = immutable Modifier!U; + else static if (is(T U == shared inout const U)) alias ModifyTypePreservingTQ = shared inout const Modifier!U; + else static if (is(T U == shared inout U)) alias ModifyTypePreservingTQ = shared inout Modifier!U; + else static if (is(T U == shared const U)) alias ModifyTypePreservingTQ = shared const Modifier!U; + else static if (is(T U == shared U)) alias ModifyTypePreservingTQ = shared Modifier!U; + else static if (is(T U == inout const U)) alias ModifyTypePreservingTQ = inout const Modifier!U; + else static if (is(T U == inout U)) alias ModifyTypePreservingTQ = inout Modifier!U; + else static if (is(T U == const U)) alias ModifyTypePreservingTQ = const Modifier!U; + else alias ModifyTypePreservingTQ = Modifier!T; +} + +template OriginalType(T) +{ + template Impl(T) + { + static if (is(T U == enum)) alias Impl = OriginalType!U; + else alias Impl = T; + } + + alias OriginalType = ModifyTypePreservingTQ!(Impl, T); +} + +enum bool isAggregateType(T) = is(T == struct) || is(T == union) || + is(T == class) || is(T == interface); + +private template AliasThisTypeOf(T) +if (isAggregateType!T) +{ + alias members = __traits(getAliasThis, T); + + static if (members.length == 1) + { + alias AliasThisTypeOf = typeof(__traits(getMember, T.init, members[0])); + } + else + static assert(0, T.stringof~" does not have alias this type"); +} + +template DynamicArrayTypeOf(T) +{ + static if (is(AliasThisTypeOf!T AT) && !is(AT[] == AT)) + alias X = DynamicArrayTypeOf!AT; + else + alias X = OriginalType!T; + + static if (is(Unqual!X : E[], E) && !is(typeof({ enum n = X.length; }))) + { + alias DynamicArrayTypeOf = X; + } + else + static assert(0, T.stringof~" is not a dynamic array"); +} + +enum bool isDynamicArray(T) = is(DynamicArrayTypeOf!T) && !isAggregateType!T; 
+enum bool isStaticArray(T) = __traits(isStaticArray, T); +enum bool isArray(T) = isStaticArray!T || isDynamicArray!T; + void Dmemset(T)(ref T dst, const ubyte val) { - import std.traits; const uint v = cast(uint)val; version (D_SIMD) { From 497e53f41b1ac5dd6b491014b54fdad1421b1a98 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Fri, 5 Jul 2019 11:54:55 +0300 Subject: [PATCH 06/29] Minor fixes/changes --- src/core/experimental/memutils.d | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 3b47bcfdf3..a9df1ac803 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -176,15 +176,6 @@ else storeUnaligned(cast(void16*)dest, reg); } } - // TODO(stefanos): Can we broadcast an int in a float4? That would be useful - // because then we would use only the float versions. - void broadcast_int(ref int4 xmm, int v) - { - xmm[0] = v; - xmm[1] = v; - xmm[2] = v; - xmm[3] = v; - } const uint v = val * 0x01010101; // Broadcast c to all 4 bytes // NOTE(stefanos): I use the naive version, which in my benchmarks was slower // than the previous classic switch. BUT. Using the switch had a significant @@ -196,10 +187,9 @@ else return; } void *temp = d + n - 0x10; // Used for the last 32 bytes - int4 xmm0; // Broadcast v to all bytes. - broadcast_int(xmm0, v); - ubyte rem = cast(ulong)d & 15; // Remainder from the previous 16-byte boundary. + auto xmm0 = int4(v); + ubyte rem = cast(ubyte)d & 15; // Remainder from the previous 16-byte boundary. // Store 16 bytes, from which some will possibly overlap on a future store. // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned, // add 16 - 7 = 9 to `d` and start storing aligned. 
Since 16 - `rem` can be at most @@ -322,20 +312,10 @@ void Dmemset(T)(ref T dst, const ubyte val) { size_t n = dst.length * typeof(dst[0]).sizeof; Dmemset(dst.ptr, v, n); - - version (unittest) - { - Dmemset_naive(dst.ptr, v, n); - } } else { Dmemset(&dst, v, T.sizeof); - - version (unittest) - { - Dmemset_naive(&dst, v, T.sizeof); - } } } else From c52c099795d885bbc162570a73b4a7a4a9acbd39 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Fri, 5 Jul 2019 15:13:38 +0300 Subject: [PATCH 07/29] Style and layout changes --- src/core/experimental/memutils.d | 333 +++++++++++++++++-------------- 1 file changed, 183 insertions(+), 150 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index a9df1ac803..00a56d523a 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -4,128 +4,53 @@ */ module core.experimental.memutils; -unittest -{ - Dmemset_testStaticType!(byte)(5); - Dmemset_testStaticType!(ubyte)(5); - Dmemset_testStaticType!(short)(5); - Dmemset_testStaticType!(ushort)(5); - Dmemset_testStaticType!(int)(5); - Dmemset_testStaticType!(uint)(5); - Dmemset_testStaticType!(long)(5); - Dmemset_testStaticType!(ulong)(5); - Dmemset_testStaticType!(float)(5); - Dmemset_testStaticType!(double)(5); - Dmemset_testStaticType!(real)(5); - Dmemset_testDynamicArray!(ubyte)(5, 3); - static foreach (i; 1..10) { - Dmemset_testDynamicArray!(ubyte)(5, 2^^i); - Dmemset_testStaticArray!(ubyte, 2^^i)(5); - } - Dmemset_testDynamicArray!(ubyte)(5, 100); - Dmemset_testStaticArray!(ubyte, 100)(5); - Dmemset_testDynamicArray!(ubyte)(5, 500); - Dmemset_testStaticArray!(ubyte, 500)(5); - Dmemset_testDynamicArray!(ubyte)(5, 700); - Dmemset_testStaticArray!(ubyte, 700)(5); - Dmemset_testDynamicArray!(ubyte)(5, 3434); - Dmemset_testStaticArray!(ubyte, 3434)(5); - Dmemset_testDynamicArray!(ubyte)(5, 7128); - Dmemset_testStaticArray!(ubyte, 7128)(5); - Dmemset_testDynamicArray!(ubyte)(5, 13908); - 
Dmemset_testStaticArray!(ubyte, 13908)(5); - Dmemset_testDynamicArray!(ubyte)(5, 16343); - Dmemset_testStaticArray!(ubyte, 16343)(5); - Dmemset_testDynamicArray!(ubyte)(5, 27897); - Dmemset_testStaticArray!(ubyte, 27897)(5); - Dmemset_testDynamicArray!(ubyte)(5, 32344); - Dmemset_testStaticArray!(ubyte, 32344)(5); - Dmemset_testDynamicArray!(ubyte)(5, 46830); - Dmemset_testStaticArray!(ubyte, 46830)(5); - Dmemset_testDynamicArray!(ubyte)(5, 64349); - Dmemset_testStaticArray!(ubyte, 64349)(5); -} +/** Dmemset() implementation */ -// From a very good Chandler Carruth video on benchmarking: https://www.youtube.com/watch?v=nXaxk27zwlk -void escape(void* p) -{ - version (LDC) - { - import ldc.llvmasm; - __asm("", "r,~{memory}", p); - } - version (GNU) - { - asm { "" : : "g" p : "memory"; } - } -} - -void Dmemset_verifyArray(T)(int j, const ref T[] a, const ubyte v) -{ - const ubyte *p = cast(const ubyte *) a.ptr; - for (size_t i = 0; i < a.length * T.sizeof; i++) - { - assert(p[i] == v); - } -} - -void Dmemset_verifyStaticType(T)(const ref T t, const ubyte v) -{ - const ubyte *p = cast(const ubyte *) &t; - for (size_t i = 0; i < T.sizeof; i++) - { - assert(p[i] == v); - } -} +/** + * NOTE(stefanos): + * Range-checking is not needed since the user never + * pass an `n` (byte count) directly. + */ -void Dmemset_testDynamicArray(T)(const ubyte v, size_t n) +/* + If T is an array,set all `dst`'s bytes + (whose count is the length of the array times + the size of the array element) to `val`. + Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`. 
+ */ +void Dmemset(T)(ref T dst, const ubyte val) { - T[] buf; - buf.length = n + 32; - - enum alignments = 32; - size_t len = n; - - foreach (i; 0..alignments) + const uint v = cast(uint) val; + version (D_SIMD) { - auto d = buf[i..i+n]; - - escape(d.ptr); - Dmemset(d, v); - Dmemset_verifyArray(i, d, v); + static if (isArray!T) + { + size_t n = dst.length * typeof(dst[0]).sizeof; + Dmemset(dst.ptr, v, n); + } + else + { + Dmemset(&dst, v, T.sizeof); + } } -} - -void Dmemset_testStaticArray(T, size_t n)(const ubyte v) -{ - T[n + 32] buf; - - enum alignments = 32; - size_t len = n; - - foreach (i; 0..alignments) + else { - auto d = buf[i..i+n]; - - escape(d.ptr); - Dmemset(d, v); - Dmemset_verifyArray(i, d, v); + static if (isArray!T) + { + Dmemset_naive(dst.ptr, val, dst.length * typeof(dst[0]).sizeof); + } + else + { + Dmemset_naive(&dst, val, T.sizeof); + } } } -void Dmemset_testStaticType(T)(const ubyte v) -{ - T t; - escape(&t); - Dmemset(t, v); - Dmemset_verifyStaticType(t, v); -} - version (GNU) { void Dmemset(void *d, const uint val, size_t n) { - Dmemset_naive(d, cast(const(ubyte))val, n); + Dmemset_naive(d, cast(const(ubyte)) val, n); } } else @@ -135,15 +60,14 @@ else // NOTE(stefanos): I could not GDC respective intrinsics. 
void Dmemset(void *d, const uint val, size_t n) { - import core.simd: int4; + import core.simd : int4; version (LDC) { - import ldc.simd: loadUnaligned, storeUnaligned; + import ldc.simd : loadUnaligned, storeUnaligned; } - else - version (DigitalMars) + else version (DigitalMars) { - import core.simd: void16, loadUnaligned, storeUnaligned; + import core.simd : void16, loadUnaligned, storeUnaligned; } else { @@ -156,24 +80,24 @@ else { version (LDC) { - storeUnaligned!int4(reg, cast(int*)dest); - storeUnaligned!int4(reg, cast(int*)(dest+0x10)); + storeUnaligned!int4(reg, cast(int*) dest); + storeUnaligned!int4(reg, cast(int*) (dest+0x10)); } else { - storeUnaligned(cast(void16*)dest, reg); - storeUnaligned(cast(void16*)(dest+0x10), reg); + storeUnaligned(cast(void16*) dest, reg); + storeUnaligned(cast(void16*) (dest+0x10), reg); } } void store16i_sse(void *dest, int4 reg) { version (LDC) { - storeUnaligned!int4(reg, cast(int*)dest); + storeUnaligned!int4(reg, cast(int*) dest); } else { - storeUnaligned(cast(void16*)dest, reg); + storeUnaligned(cast(void16*) dest, reg); } } const uint v = val * 0x01010101; // Broadcast c to all 4 bytes @@ -183,13 +107,13 @@ else // but the fact that it's more difficult to optimize it as part of the rest of the code. if (n <= 16) { - Dmemset_naive(cast(ubyte*)d, cast(ubyte)val, n); + Dmemset_naive(cast(ubyte*) d, cast(ubyte) val, n); return; } void *temp = d + n - 0x10; // Used for the last 32 bytes // Broadcast v to all bytes. auto xmm0 = int4(v); - ubyte rem = cast(ubyte)d & 15; // Remainder from the previous 16-byte boundary. + ubyte rem = cast(ubyte) d & 15; // Remainder from the previous 16-byte boundary. // Store 16 bytes, from which some will possibly overlap on a future store. // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned, // add 16 - 7 = 9 to `d` and start storing aligned. 
Since 16 - `rem` can be at most @@ -230,19 +154,36 @@ else void Dmemset_naive(void *dst, const ubyte val, size_t n) { - ubyte *d = cast(ubyte*)dst; - for (size_t i = 0; i != n; ++i) + ubyte *d = cast(ubyte*) dst; + foreach (i; 0 .. n) { d[i] = val; } } -// NOTE(stefanos): -// Range-checking is not needed since the user never -// pass an `n` (byte count) directly. +/** Core features tests. + */ +unittest +{ + ubyte a[3]; + Dmemset(a, 7); + assert(a[0] == 7); + assert(a[1] == 7); + assert(a[2] == 7); + + real b; + Dmemset(b, 9); + ubyte *p = cast(ubyte*) &b; + foreach (i; 0 .. b.sizeof) + { + assert(p[i] == 9); + } +} -// Copied from std.traits -import core.internal.traits: Unqual; + +/** Handy std.traits code, directly copied from there. + */ +import core.internal.traits : Unqual; package template ModifyTypePreservingTQ(alias Modifier, T) { @@ -303,30 +244,122 @@ enum bool isDynamicArray(T) = is(DynamicArrayTypeOf!T) && !isAggregateType!T; enum bool isStaticArray(T) = __traits(isStaticArray, T); enum bool isArray(T) = isStaticArray!T || isDynamicArray!T; -void Dmemset(T)(ref T dst, const ubyte val) + +/** Test suite code + */ +unittest { - const uint v = cast(uint)val; - version (D_SIMD) + DmemsetTestStaticType!(byte)(5); + DmemsetTestStaticType!(ubyte)(5); + DmemsetTestStaticType!(short)(5); + DmemsetTestStaticType!(ushort)(5); + DmemsetTestStaticType!(int)(5); + DmemsetTestStaticType!(uint)(5); + DmemsetTestStaticType!(long)(5); + DmemsetTestStaticType!(ulong)(5); + DmemsetTestStaticType!(float)(5); + DmemsetTestStaticType!(double)(5); + DmemsetTestStaticType!(real)(5); + DmemsetTestDynamicArray!(ubyte)(5, 3); + static foreach (i; 1..10) { + DmemsetTestDynamicArray!(ubyte)(5, 2^^i); + DmemsetTestStaticArray!(ubyte, 2^^i)(5); + } + DmemsetTestDynamicArray!(ubyte)(5, 100); + DmemsetTestStaticArray!(ubyte, 100)(5); + DmemsetTestDynamicArray!(ubyte)(5, 500); + DmemsetTestStaticArray!(ubyte, 500)(5); + DmemsetTestDynamicArray!(ubyte)(5, 700); + 
DmemsetTestStaticArray!(ubyte, 700)(5); + DmemsetTestDynamicArray!(ubyte)(5, 3434); + DmemsetTestStaticArray!(ubyte, 3434)(5); + DmemsetTestDynamicArray!(ubyte)(5, 7128); + DmemsetTestStaticArray!(ubyte, 7128)(5); + DmemsetTestDynamicArray!(ubyte)(5, 13908); + DmemsetTestStaticArray!(ubyte, 13908)(5); + DmemsetTestDynamicArray!(ubyte)(5, 16343); + DmemsetTestStaticArray!(ubyte, 16343)(5); + DmemsetTestDynamicArray!(ubyte)(5, 27897); + DmemsetTestStaticArray!(ubyte, 27897)(5); + DmemsetTestDynamicArray!(ubyte)(5, 32344); + DmemsetTestStaticArray!(ubyte, 32344)(5); + DmemsetTestDynamicArray!(ubyte)(5, 46830); + DmemsetTestStaticArray!(ubyte, 46830)(5); + DmemsetTestDynamicArray!(ubyte)(5, 64349); + DmemsetTestStaticArray!(ubyte, 64349)(5); +} + +// From a very good Chandler Carruth video on benchmarking: https://www.youtube.com/watch?v=nXaxk27zwlk +void escape(void* p) +{ + version (LDC) { - static if (isArray!T) - { - size_t n = dst.length * typeof(dst[0]).sizeof; - Dmemset(dst.ptr, v, n); - } - else - { - Dmemset(&dst, v, T.sizeof); - } + import ldc.llvmasm; + __asm("", "r,~{memory}", p); } - else + version (GNU) { - static if (isArray!T) - { - Dmemset_naive(dst.ptr, val, dst.length * typeof(dst[0]).sizeof); - } - else - { - Dmemset_naive(&dst, val, T.sizeof); - } + asm { "" : : "g" p : "memory"; } + } +} + +void DmemsetVerifyArray(T)(int j, const ref T[] a, const ubyte v) +{ + const ubyte *p = cast(const ubyte *) a.ptr; + foreach (i; 0 .. (a.length * T.sizeof)) + { + assert(p[i] == v); + } +} + +void DmemsetVerifyStaticType(T)(const ref T t, const ubyte v) +{ + const ubyte *p = cast(const ubyte *) &t; + foreach (i; 0 .. T.sizeof) + { + assert(p[i] == v); + } +} + +void DmemsetTestDynamicArray(T)(const ubyte v, size_t n) +{ + T[] buf; + buf.length = n + 32; + + enum alignments = 32; + size_t len = n; + + foreach (i; 0 .. 
alignments) + { + auto d = buf[i..i+n]; + + escape(d.ptr); + Dmemset(d, v); + DmemsetVerifyArray(i, d, v); + } +} + +void DmemsetTestStaticArray(T, size_t n)(const ubyte v) +{ + T[n + 32] buf; + + enum alignments = 32; + size_t len = n; + + foreach (i; 0..alignments) + { + auto d = buf[i..i+n]; + + escape(d.ptr); + Dmemset(d, v); + DmemsetVerifyArray(i, d, v); } } + +void DmemsetTestStaticType(T)(const ubyte v) +{ + T t; + escape(&t); + Dmemset(t, v); + DmemsetVerifyStaticType(t, v); +} From 6ebec4bfb4e13df5d6c2a877aa29a5cc8597f556 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Fri, 5 Jul 2019 15:29:15 +0300 Subject: [PATCH 08/29] Moved tests to test folder --- src/core/experimental/memutils.d | 122 +------------------------------ test/experimental/Makefile | 17 +++++ test/experimental/src/memutils.d | 118 ++++++++++++++++++++++++++++++ 3 files changed, 136 insertions(+), 121 deletions(-) create mode 100644 test/experimental/Makefile create mode 100644 test/experimental/src/memutils.d diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 00a56d523a..4a325d1d5c 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -165,7 +165,7 @@ void Dmemset_naive(void *dst, const ubyte val, size_t n) */ unittest { - ubyte a[3]; + ubyte[3] a; Dmemset(a, 7); assert(a[0] == 7); assert(a[1] == 7); @@ -243,123 +243,3 @@ template DynamicArrayTypeOf(T) enum bool isDynamicArray(T) = is(DynamicArrayTypeOf!T) && !isAggregateType!T; enum bool isStaticArray(T) = __traits(isStaticArray, T); enum bool isArray(T) = isStaticArray!T || isDynamicArray!T; - - -/** Test suite code - */ -unittest -{ - DmemsetTestStaticType!(byte)(5); - DmemsetTestStaticType!(ubyte)(5); - DmemsetTestStaticType!(short)(5); - DmemsetTestStaticType!(ushort)(5); - DmemsetTestStaticType!(int)(5); - DmemsetTestStaticType!(uint)(5); - DmemsetTestStaticType!(long)(5); - DmemsetTestStaticType!(ulong)(5); - DmemsetTestStaticType!(float)(5); - 
DmemsetTestStaticType!(double)(5); - DmemsetTestStaticType!(real)(5); - DmemsetTestDynamicArray!(ubyte)(5, 3); - static foreach (i; 1..10) { - DmemsetTestDynamicArray!(ubyte)(5, 2^^i); - DmemsetTestStaticArray!(ubyte, 2^^i)(5); - } - DmemsetTestDynamicArray!(ubyte)(5, 100); - DmemsetTestStaticArray!(ubyte, 100)(5); - DmemsetTestDynamicArray!(ubyte)(5, 500); - DmemsetTestStaticArray!(ubyte, 500)(5); - DmemsetTestDynamicArray!(ubyte)(5, 700); - DmemsetTestStaticArray!(ubyte, 700)(5); - DmemsetTestDynamicArray!(ubyte)(5, 3434); - DmemsetTestStaticArray!(ubyte, 3434)(5); - DmemsetTestDynamicArray!(ubyte)(5, 7128); - DmemsetTestStaticArray!(ubyte, 7128)(5); - DmemsetTestDynamicArray!(ubyte)(5, 13908); - DmemsetTestStaticArray!(ubyte, 13908)(5); - DmemsetTestDynamicArray!(ubyte)(5, 16343); - DmemsetTestStaticArray!(ubyte, 16343)(5); - DmemsetTestDynamicArray!(ubyte)(5, 27897); - DmemsetTestStaticArray!(ubyte, 27897)(5); - DmemsetTestDynamicArray!(ubyte)(5, 32344); - DmemsetTestStaticArray!(ubyte, 32344)(5); - DmemsetTestDynamicArray!(ubyte)(5, 46830); - DmemsetTestStaticArray!(ubyte, 46830)(5); - DmemsetTestDynamicArray!(ubyte)(5, 64349); - DmemsetTestStaticArray!(ubyte, 64349)(5); -} - -// From a very good Chandler Carruth video on benchmarking: https://www.youtube.com/watch?v=nXaxk27zwlk -void escape(void* p) -{ - version (LDC) - { - import ldc.llvmasm; - __asm("", "r,~{memory}", p); - } - version (GNU) - { - asm { "" : : "g" p : "memory"; } - } -} - -void DmemsetVerifyArray(T)(int j, const ref T[] a, const ubyte v) -{ - const ubyte *p = cast(const ubyte *) a.ptr; - foreach (i; 0 .. (a.length * T.sizeof)) - { - assert(p[i] == v); - } -} - -void DmemsetVerifyStaticType(T)(const ref T t, const ubyte v) -{ - const ubyte *p = cast(const ubyte *) &t; - foreach (i; 0 .. T.sizeof) - { - assert(p[i] == v); - } -} - -void DmemsetTestDynamicArray(T)(const ubyte v, size_t n) -{ - T[] buf; - buf.length = n + 32; - - enum alignments = 32; - size_t len = n; - - foreach (i; 0 .. 
alignments) - { - auto d = buf[i..i+n]; - - escape(d.ptr); - Dmemset(d, v); - DmemsetVerifyArray(i, d, v); - } -} - -void DmemsetTestStaticArray(T, size_t n)(const ubyte v) -{ - T[n + 32] buf; - - enum alignments = 32; - size_t len = n; - - foreach (i; 0..alignments) - { - auto d = buf[i..i+n]; - - escape(d.ptr); - Dmemset(d, v); - DmemsetVerifyArray(i, d, v); - } -} - -void DmemsetTestStaticType(T)(const ubyte v) -{ - T t; - escape(&t); - Dmemset(t, v); - DmemsetVerifyStaticType(t, v); -} diff --git a/test/experimental/Makefile b/test/experimental/Makefile new file mode 100644 index 0000000000..2dbbd68aae --- /dev/null +++ b/test/experimental/Makefile @@ -0,0 +1,17 @@ +include ../common.mak + +TESTS:=memutils + +.PHONY: all clean +all: $(addprefix $(ROOT)/,$(addsuffix .done,$(TESTS))) + +$(ROOT)/%.done: $(ROOT)/% + @echo Testing $* + $(QUIET)$(TIMELIMIT)$(ROOT)/$* $(RUN_ARGS) + @touch $@ + +$(ROOT)/%: $(SRC)/%.d + $(QUIET)$(DMD) $(DFLAGS) -of$@ $< + +clean: + rm -rf $(ROOT) diff --git a/test/experimental/src/memutils.d b/test/experimental/src/memutils.d new file mode 100644 index 0000000000..43682baa69 --- /dev/null +++ b/test/experimental/src/memutils.d @@ -0,0 +1,118 @@ +import core.experimental.memutils: Dmemset; + +void main() +{ + DmemsetTestStaticType!(byte)(5); + DmemsetTestStaticType!(ubyte)(5); + DmemsetTestStaticType!(short)(5); + DmemsetTestStaticType!(ushort)(5); + DmemsetTestStaticType!(int)(5); + DmemsetTestStaticType!(uint)(5); + DmemsetTestStaticType!(long)(5); + DmemsetTestStaticType!(ulong)(5); + DmemsetTestStaticType!(float)(5); + DmemsetTestStaticType!(double)(5); + DmemsetTestStaticType!(real)(5); + DmemsetTestDynamicArray!(ubyte)(5, 3); + static foreach (i; 1..10) { + DmemsetTestDynamicArray!(ubyte)(5, 2^^i); + DmemsetTestStaticArray!(ubyte, 2^^i)(5); + } + DmemsetTestDynamicArray!(ubyte)(5, 100); + DmemsetTestStaticArray!(ubyte, 100)(5); + DmemsetTestDynamicArray!(ubyte)(5, 500); + DmemsetTestStaticArray!(ubyte, 500)(5); + 
DmemsetTestDynamicArray!(ubyte)(5, 700); + DmemsetTestStaticArray!(ubyte, 700)(5); + DmemsetTestDynamicArray!(ubyte)(5, 3434); + DmemsetTestStaticArray!(ubyte, 3434)(5); + DmemsetTestDynamicArray!(ubyte)(5, 7128); + DmemsetTestStaticArray!(ubyte, 7128)(5); + DmemsetTestDynamicArray!(ubyte)(5, 13908); + DmemsetTestStaticArray!(ubyte, 13908)(5); + DmemsetTestDynamicArray!(ubyte)(5, 16343); + DmemsetTestStaticArray!(ubyte, 16343)(5); + DmemsetTestDynamicArray!(ubyte)(5, 27897); + DmemsetTestStaticArray!(ubyte, 27897)(5); + DmemsetTestDynamicArray!(ubyte)(5, 32344); + DmemsetTestStaticArray!(ubyte, 32344)(5); + DmemsetTestDynamicArray!(ubyte)(5, 46830); + DmemsetTestStaticArray!(ubyte, 46830)(5); + DmemsetTestDynamicArray!(ubyte)(5, 64349); + DmemsetTestStaticArray!(ubyte, 64349)(5); +} + +// From a very good Chandler Carruth video on benchmarking: https://www.youtube.com/watch?v=nXaxk27zwlk +void escape(void* p) +{ + version (LDC) + { + import ldc.llvmasm; + __asm("", "r,~{memory}", p); + } + version (GNU) + { + asm { "" : : "g" p : "memory"; } + } +} + +void DmemsetVerifyArray(T)(int j, const ref T[] a, const ubyte v) +{ + const ubyte *p = cast(const ubyte *) a.ptr; + foreach (i; 0 .. (a.length * T.sizeof)) + { + assert(p[i] == v); + } +} + +void DmemsetVerifyStaticType(T)(const ref T t, const ubyte v) +{ + const ubyte *p = cast(const ubyte *) &t; + foreach (i; 0 .. T.sizeof) + { + assert(p[i] == v); + } +} + +void DmemsetTestDynamicArray(T)(const ubyte v, size_t n) +{ + T[] buf; + buf.length = n + 32; + + enum alignments = 32; + size_t len = n; + + foreach (i; 0 .. 
alignments) + { + auto d = buf[i..i+n]; + + escape(d.ptr); + Dmemset(d, v); + DmemsetVerifyArray(i, d, v); + } +} + +void DmemsetTestStaticArray(T, size_t n)(const ubyte v) +{ + T[n + 32] buf; + + enum alignments = 32; + size_t len = n; + + foreach (i; 0..alignments) + { + auto d = buf[i..i+n]; + + escape(d.ptr); + Dmemset(d, v); + DmemsetVerifyArray(i, d, v); + } +} + +void DmemsetTestStaticType(T)(const ubyte v) +{ + T t; + escape(&t); + Dmemset(t, v); + DmemsetVerifyStaticType(t, v); +} From 57552eda1392b5b995389c1f559d6482d1241f77 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Fri, 5 Jul 2019 15:53:32 +0300 Subject: [PATCH 09/29] More naming and style changes --- src/core/experimental/memutils.d | 16 ++++++++-------- test/experimental/src/memutils.d | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 4a325d1d5c..5a2420ee5e 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -18,7 +18,7 @@ module core.experimental.memutils; the size of the array element) to `val`. Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`. */ -void Dmemset(T)(ref T dst, const ubyte val) +void memset(T)(ref T dst, const ubyte val) { const uint v = cast(uint) val; version (D_SIMD) @@ -37,20 +37,20 @@ void Dmemset(T)(ref T dst, const ubyte val) { static if (isArray!T) { - Dmemset_naive(dst.ptr, val, dst.length * typeof(dst[0]).sizeof); + DmemsetNaive(dst.ptr, val, dst.length * typeof(dst[0]).sizeof); } else { - Dmemset_naive(&dst, val, T.sizeof); + DmemsetNaive(&dst, val, T.sizeof); } } } version (GNU) { - void Dmemset(void *d, const uint val, size_t n) + private void Dmemset(void *d, const uint val, size_t n) { - Dmemset_naive(d, cast(const(ubyte)) val, n); + DmemsetNaive(d, cast(const(ubyte)) val, n); } } else @@ -58,7 +58,7 @@ else version (D_SIMD) { // NOTE(stefanos): I could not GDC respective intrinsics. 
- void Dmemset(void *d, const uint val, size_t n) + private void Dmemset(void *d, const uint val, size_t n) { import core.simd : int4; version (LDC) @@ -107,7 +107,7 @@ else // but the fact that it's more difficult to optimize it as part of the rest of the code. if (n <= 16) { - Dmemset_naive(cast(ubyte*) d, cast(ubyte) val, n); + DmemsetNaive(cast(ubyte*) d, cast(ubyte) val, n); return; } void *temp = d + n - 0x10; // Used for the last 32 bytes @@ -152,7 +152,7 @@ else } } -void Dmemset_naive(void *dst, const ubyte val, size_t n) +private void DmemsetNaive(void *dst, const ubyte val, size_t n) { ubyte *d = cast(ubyte*) dst; foreach (i; 0 .. n) diff --git a/test/experimental/src/memutils.d b/test/experimental/src/memutils.d index 43682baa69..8a30fc3217 100644 --- a/test/experimental/src/memutils.d +++ b/test/experimental/src/memutils.d @@ -1,4 +1,4 @@ -import core.experimental.memutils: Dmemset; +import core.experimental.memutils : memset; void main() { @@ -87,7 +87,7 @@ void DmemsetTestDynamicArray(T)(const ubyte v, size_t n) auto d = buf[i..i+n]; escape(d.ptr); - Dmemset(d, v); + memset(d, v); DmemsetVerifyArray(i, d, v); } } @@ -104,7 +104,7 @@ void DmemsetTestStaticArray(T, size_t n)(const ubyte v) auto d = buf[i..i+n]; escape(d.ptr); - Dmemset(d, v); + memset(d, v); DmemsetVerifyArray(i, d, v); } } @@ -113,6 +113,6 @@ void DmemsetTestStaticType(T)(const ubyte v) { T t; escape(&t); - Dmemset(t, v); + memset(t, v); DmemsetVerifyStaticType(t, v); } From 60b39670eb8910c580b483bf98dbf1a2e53bd014 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Fri, 5 Jul 2019 15:56:56 +0300 Subject: [PATCH 10/29] Minor fix --- src/core/experimental/memutils.d | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 5a2420ee5e..c12b6940d6 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -166,13 +166,13 @@ private void DmemsetNaive(void *dst, const ubyte 
val, size_t n) unittest { ubyte[3] a; - Dmemset(a, 7); + memset(a, 7); assert(a[0] == 7); assert(a[1] == 7); assert(a[2] == 7); real b; - Dmemset(b, 9); + memset(b, 9); ubyte *p = cast(ubyte*) &b; foreach (i; 0 .. b.sizeof) { From 4faa8f8dad8e2631b5e4a70065e518e4fc523514 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Fri, 5 Jul 2019 17:00:55 +0300 Subject: [PATCH 11/29] Versioning improvement --- src/core/experimental/memutils.d | 202 +++++++++++++++---------------- 1 file changed, 100 insertions(+), 102 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index c12b6940d6..cffa0d4fdb 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -4,7 +4,7 @@ */ module core.experimental.memutils; -/** Dmemset() implementation */ +/** memset() implementation */ /** * NOTE(stefanos): @@ -17,32 +17,19 @@ module core.experimental.memutils; (whose count is the length of the array times the size of the array element) to `val`. Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`. 
- */ +*/ + void memset(T)(ref T dst, const ubyte val) { const uint v = cast(uint) val; - version (D_SIMD) + static if (isArray!T) { - static if (isArray!T) - { - size_t n = dst.length * typeof(dst[0]).sizeof; - Dmemset(dst.ptr, v, n); - } - else - { - Dmemset(&dst, v, T.sizeof); - } + size_t n = dst.length * typeof(dst[0]).sizeof; + Dmemset(dst.ptr, v, n); } else { - static if (isArray!T) - { - DmemsetNaive(dst.ptr, val, dst.length * typeof(dst[0]).sizeof); - } - else - { - DmemsetNaive(&dst, val, T.sizeof); - } + Dmemset(&dst, v, T.sizeof); } } @@ -50,109 +37,119 @@ version (GNU) { private void Dmemset(void *d, const uint val, size_t n) { - DmemsetNaive(d, cast(const(ubyte)) val, n); + memsetNaive(d, val, n); } } else +version (D_SIMD) { - version (D_SIMD) + /* SIMD implementation + */ + private void Dmemset(void *d, const uint val, size_t n) { - // NOTE(stefanos): I could not GDC respective intrinsics. - private void Dmemset(void *d, const uint val, size_t n) + import core.simd : int4; + version (LDC) + { + import ldc.simd : loadUnaligned, storeUnaligned; + } + else version (DigitalMars) + { + import core.simd : void16, loadUnaligned, storeUnaligned; + } + else + { + static assert(0, "Only DMD / LDC are supported"); + } + // TODO(stefanos): Is there a way to make them @safe? + // (The problem is that for LDC, they could take int* or float* pointers + // but the cast to void16 for DMD is necessary anyway). + void store32i_sse(void *dest, int4 reg) { - import core.simd : int4; version (LDC) { - import ldc.simd : loadUnaligned, storeUnaligned; - } - else version (DigitalMars) - { - import core.simd : void16, loadUnaligned, storeUnaligned; + storeUnaligned!int4(reg, cast(int*) dest); + storeUnaligned!int4(reg, cast(int*) (dest+0x10)); } else { - static assert(0, "Only DMD / LDC are supported"); - } - // TODO(stefanos): Is there a way to make them @safe? 
- // (The problem is that for LDC, they could take int* or float* pointers - // but the cast to void16 for DMD is necessary anyway). - void store32i_sse(void *dest, int4 reg) - { - version (LDC) - { - storeUnaligned!int4(reg, cast(int*) dest); - storeUnaligned!int4(reg, cast(int*) (dest+0x10)); - } - else - { - storeUnaligned(cast(void16*) dest, reg); - storeUnaligned(cast(void16*) (dest+0x10), reg); - } + storeUnaligned(cast(void16*) dest, reg); + storeUnaligned(cast(void16*) (dest+0x10), reg); } - void store16i_sse(void *dest, int4 reg) + } + void store16i_sse(void *dest, int4 reg) + { + version (LDC) { - version (LDC) - { - storeUnaligned!int4(reg, cast(int*) dest); - } - else - { - storeUnaligned(cast(void16*) dest, reg); - } + storeUnaligned!int4(reg, cast(int*) dest); } - const uint v = val * 0x01010101; // Broadcast c to all 4 bytes - // NOTE(stefanos): I use the naive version, which in my benchmarks was slower - // than the previous classic switch. BUT. Using the switch had a significant - // drop in the rest of the sizes. It's not the branch that is responsible for the drop, - // but the fact that it's more difficult to optimize it as part of the rest of the code. - if (n <= 16) + else { - DmemsetNaive(cast(ubyte*) d, cast(ubyte) val, n); - return; + storeUnaligned(cast(void16*) dest, reg); } - void *temp = d + n - 0x10; // Used for the last 32 bytes - // Broadcast v to all bytes. - auto xmm0 = int4(v); - ubyte rem = cast(ubyte) d & 15; // Remainder from the previous 16-byte boundary. - // Store 16 bytes, from which some will possibly overlap on a future store. - // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned, - // add 16 - 7 = 9 to `d` and start storing aligned. Since 16 - `rem` can be at most - // 16, we store 16 bytes anyway. - store16i_sse(d, xmm0); - d += 16 - rem; - n -= 16 - rem; - // Move in blocks of 32. - // TODO(stefanos): Experiment with differnt sizes. 
- if (n >= 32) + } + const uint v = val * 0x01010101; // Broadcast c to all 4 bytes + // NOTE(stefanos): I use the naive version, which in my benchmarks was slower + // than the previous classic switch. BUT. Using the switch had a significant + // drop in the rest of the sizes. It's not the branch that is responsible for the drop, + // but the fact that it's more difficult to optimize it as part of the rest of the code. + if (n <= 16) + { + DmemsetNaive(cast(ubyte*) d, cast(ubyte) val, n); + return; + } + void *temp = d + n - 0x10; // Used for the last 32 bytes + // Broadcast v to all bytes. + auto xmm0 = int4(v); + ubyte rem = cast(ubyte) d & 15; // Remainder from the previous 16-byte boundary. + // Store 16 bytes, from which some will possibly overlap on a future store. + // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned, + // add 16 - 7 = 9 to `d` and start storing aligned. Since 16 - `rem` can be at most + // 16, we store 16 bytes anyway. + store16i_sse(d, xmm0); + d += 16 - rem; + n -= 16 - rem; + // Move in blocks of 32. + // TODO(stefanos): Experiment with differnt sizes. + if (n >= 32) + { + // Align to (previous) multiple of 32. That does something invisible to the code, + // but a good optimizer will avoid a `cmp` instruction inside the loop. With a + // multiple of 32, the end of the loop can be (if we assume that `n` is in RDX): + // sub RDX, 32; + // jge START_OF_THE_LOOP. + // Without that, it has to be: + // sub RDX, 32; + // cmp RDX, 32; + // jge START_OF_THE_LOOP + // NOTE, that we align on a _previous_ multiple (for 37, we will go to 32). That means + // we have somehow to compensate for that, which is done at the end of this function. + n &= -32; + do { - // Align to (previous) multiple of 32. That does something invisible to the code, - // but a good optimizer will avoid a `cmp` instruction inside the loop. 
With a - // multiple of 32, the end of the loop can be (if we assume that `n` is in RDX): - // sub RDX, 32; - // jge START_OF_THE_LOOP. - // Without that, it has to be: - // sub RDX, 32; - // cmp RDX, 32; - // jge START_OF_THE_LOOP - // NOTE, that we align on a _previous_ multiple (for 37, we will go to 32). That means - // we have somehow to compensate for that, which is done at the end of this function. - n &= -32; - do - { - store32i_sse(d, xmm0); - // NOTE(stefanos): I tried avoiding this operation on `d` by combining - // `d` and `n` in the above loop and going backwards. It was slower in my benchs. - d += 32; - n -= 32; - } while (n >= 32); - } - // Compensate for the last (at most) 32 bytes. - store32i_sse(temp-0x10, xmm0); + store32i_sse(d, xmm0); + // NOTE(stefanos): I tried avoiding this operation on `d` by combining + // `d` and `n` in the above loop and going backwards. It was slower in my benchs. + d += 32; + n -= 32; + } while (n >= 32); } + // Compensate for the last (at most) 32 bytes. + store32i_sse(temp-0x10, xmm0); + } + +} +else +{ + private void Dmemset(void *d, const uint val, size_t n) + { + memsetNaive(d, val, n); } + } -private void DmemsetNaive(void *dst, const ubyte val, size_t n) +/* Naive implementation + */ +private void memsetNaive(void *dst, const ubyte val, size_t n) { ubyte *d = cast(ubyte*) dst; foreach (i; 0 .. n) @@ -161,6 +158,7 @@ private void DmemsetNaive(void *dst, const ubyte val, size_t n) } } + /** Core features tests. 
*/ unittest From a161b98ecb83e91389190d73f16e103182b3f3f6 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Fri, 5 Jul 2019 17:15:13 +0300 Subject: [PATCH 12/29] Move std.traits code to core.internal.traits --- src/core/experimental/memutils.d | 66 +--------------- src/core/internal/traits.d | 125 +++++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+), 65 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index cffa0d4fdb..52cea7f555 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -21,6 +21,7 @@ module core.experimental.memutils; void memset(T)(ref T dst, const ubyte val) { + import core.internal.traits : isArray; const uint v = cast(uint) val; static if (isArray!T) { @@ -144,7 +145,6 @@ else { memsetNaive(d, val, n); } - } /* Naive implementation @@ -177,67 +177,3 @@ unittest assert(p[i] == 9); } } - - -/** Handy std.traits code, directly copied from there. - */ -import core.internal.traits : Unqual; - -package template ModifyTypePreservingTQ(alias Modifier, T) -{ - static if (is(T U == immutable U)) alias ModifyTypePreservingTQ = immutable Modifier!U; - else static if (is(T U == shared inout const U)) alias ModifyTypePreservingTQ = shared inout const Modifier!U; - else static if (is(T U == shared inout U)) alias ModifyTypePreservingTQ = shared inout Modifier!U; - else static if (is(T U == shared const U)) alias ModifyTypePreservingTQ = shared const Modifier!U; - else static if (is(T U == shared U)) alias ModifyTypePreservingTQ = shared Modifier!U; - else static if (is(T U == inout const U)) alias ModifyTypePreservingTQ = inout const Modifier!U; - else static if (is(T U == inout U)) alias ModifyTypePreservingTQ = inout Modifier!U; - else static if (is(T U == const U)) alias ModifyTypePreservingTQ = const Modifier!U; - else alias ModifyTypePreservingTQ = Modifier!T; -} - -template OriginalType(T) -{ - template Impl(T) - { - static if (is(T U == enum)) alias Impl = 
OriginalType!U; - else alias Impl = T; - } - - alias OriginalType = ModifyTypePreservingTQ!(Impl, T); -} - -enum bool isAggregateType(T) = is(T == struct) || is(T == union) || - is(T == class) || is(T == interface); - -private template AliasThisTypeOf(T) -if (isAggregateType!T) -{ - alias members = __traits(getAliasThis, T); - - static if (members.length == 1) - { - alias AliasThisTypeOf = typeof(__traits(getMember, T.init, members[0])); - } - else - static assert(0, T.stringof~" does not have alias this type"); -} - -template DynamicArrayTypeOf(T) -{ - static if (is(AliasThisTypeOf!T AT) && !is(AT[] == AT)) - alias X = DynamicArrayTypeOf!AT; - else - alias X = OriginalType!T; - - static if (is(Unqual!X : E[], E) && !is(typeof({ enum n = X.length; }))) - { - alias DynamicArrayTypeOf = X; - } - else - static assert(0, T.stringof~" is not a dynamic array"); -} - -enum bool isDynamicArray(T) = is(DynamicArrayTypeOf!T) && !isAggregateType!T; -enum bool isStaticArray(T) = __traits(isStaticArray, T); -enum bool isArray(T) = isStaticArray!T || isDynamicArray!T; diff --git a/src/core/internal/traits.d b/src/core/internal/traits.d index bccf1ad356..089346e251 100644 --- a/src/core/internal/traits.d +++ b/src/core/internal/traits.d @@ -567,3 +567,128 @@ if (func.length == 1 /*&& isCallable!func*/) static assert(P_dglit.length == 1); static assert(is(P_dglit[0] == int)); } + +// [For internal use] +package template ModifyTypePreservingTQ(alias Modifier, T) +{ + static if (is(T U == immutable U)) alias ModifyTypePreservingTQ = immutable Modifier!U; + else static if (is(T U == shared inout const U)) alias ModifyTypePreservingTQ = shared inout const Modifier!U; + else static if (is(T U == shared inout U)) alias ModifyTypePreservingTQ = shared inout Modifier!U; + else static if (is(T U == shared const U)) alias ModifyTypePreservingTQ = shared const Modifier!U; + else static if (is(T U == shared U)) alias ModifyTypePreservingTQ = shared Modifier!U; + else static if (is(T U == 
inout const U)) alias ModifyTypePreservingTQ = inout const Modifier!U; + else static if (is(T U == inout U)) alias ModifyTypePreservingTQ = inout Modifier!U; + else static if (is(T U == const U)) alias ModifyTypePreservingTQ = const Modifier!U; + else alias ModifyTypePreservingTQ = Modifier!T; +} + +@safe unittest +{ + alias Intify(T) = int; + static assert(is(ModifyTypePreservingTQ!(Intify, real) == int)); + static assert(is(ModifyTypePreservingTQ!(Intify, const real) == const int)); + static assert(is(ModifyTypePreservingTQ!(Intify, inout real) == inout int)); + static assert(is(ModifyTypePreservingTQ!(Intify, inout const real) == inout const int)); + static assert(is(ModifyTypePreservingTQ!(Intify, shared real) == shared int)); + static assert(is(ModifyTypePreservingTQ!(Intify, shared const real) == shared const int)); + static assert(is(ModifyTypePreservingTQ!(Intify, shared inout real) == shared inout int)); + static assert(is(ModifyTypePreservingTQ!(Intify, shared inout const real) == shared inout const int)); + static assert(is(ModifyTypePreservingTQ!(Intify, immutable real) == immutable int)); +} + +/** + * Strips off all `enum`s from type `T`. + */ +template OriginalType(T) +{ + template Impl(T) + { + static if (is(T U == enum)) alias Impl = OriginalType!U; + else alias Impl = T; + } + + alias OriginalType = ModifyTypePreservingTQ!(Impl, T); +} + +/// +@safe unittest +{ + enum E : real { a = 0 } // NOTE: explicit initialization to 0 required during Enum init deprecation cycle + enum F : E { a = E.a } + alias G = const(F); + static assert(is(OriginalType!E == real)); + static assert(is(OriginalType!F == real)); + static assert(is(OriginalType!G == const real)); +} + +/** + * Detect whether type `T` is an aggregate type. 
+ */ +enum bool isAggregateType(T) = is(T == struct) || is(T == union) || + is(T == class) || is(T == interface); + +private template AliasThisTypeOf(T) +if (isAggregateType!T) +{ + alias members = __traits(getAliasThis, T); + + static if (members.length == 1) + { + alias AliasThisTypeOf = typeof(__traits(getMember, T.init, members[0])); + } + else + static assert(0, T.stringof~" does not have alias this type"); +} + +/* + */ +template DynamicArrayTypeOf(T) +{ + static if (is(AliasThisTypeOf!T AT) && !is(AT[] == AT)) + alias X = DynamicArrayTypeOf!AT; + else + alias X = OriginalType!T; + + static if (is(Unqual!X : E[], E) && !is(typeof({ enum n = X.length; }))) + { + alias DynamicArrayTypeOf = X; + } + else + static assert(0, T.stringof~" is not a dynamic array"); +} + +@safe unittest +{ + static foreach (T; AliasSeq!(/*void, */bool, NumericTypeList, /*ImaginaryTypeList, ComplexTypeList*/)) + static foreach (Q; AliasSeq!(TypeQualifierList, InoutOf, SharedInoutOf)) + { + static assert(is( Q!T[] == DynamicArrayTypeOf!( Q!T[] ) )); + static assert(is( Q!(T[]) == DynamicArrayTypeOf!( Q!(T[]) ) )); + + static foreach (P; AliasSeq!(MutableOf, ConstOf, ImmutableOf)) + { + static assert(is( Q!(P!T[]) == DynamicArrayTypeOf!( Q!(SubTypeOf!(P!T[])) ) )); + static assert(is( Q!(P!(T[])) == DynamicArrayTypeOf!( Q!(SubTypeOf!(P!(T[]))) ) )); + } + } + + static assert(!is(DynamicArrayTypeOf!(int[3]))); + static assert(!is(DynamicArrayTypeOf!(void[3]))); + static assert(!is(DynamicArrayTypeOf!(typeof(null)))); +} + +/** + * Detect whether type `T` is a dynamic array. + */ +enum bool isDynamicArray(T) = is(DynamicArrayTypeOf!T) && !isAggregateType!T; + +/** + * Detect whether type `T` is an array (static or dynamic; for associative + * arrays see $(LREF isAssociativeArray)). + */ +enum bool isArray(T) = isStaticArray!T || isDynamicArray!T; + +/** + * Detect whether type `T` is a static array. 
+ */ +enum bool isStaticArray(T) = __traits(isStaticArray, T); From 5da39a9345bf90f28060d5982581ff066e4d2e69 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Fri, 5 Jul 2019 17:17:20 +0300 Subject: [PATCH 13/29] Naming fix --- src/core/experimental/memutils.d | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 52cea7f555..ab83e4f2b9 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -95,7 +95,7 @@ version (D_SIMD) // but the fact that it's more difficult to optimize it as part of the rest of the code. if (n <= 16) { - DmemsetNaive(cast(ubyte*) d, cast(ubyte) val, n); + memsetNaive(cast(ubyte*) d, cast(ubyte) val, n); return; } void *temp = d + n - 0x10; // Used for the last 32 bytes From cc6d019d3f72bdb467b5671d9642fa8c2bef53a2 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Fri, 5 Jul 2019 17:22:04 +0300 Subject: [PATCH 14/29] Fix in using non-existent code in internal.traits unittests --- src/core/internal/traits.d | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/core/internal/traits.d b/src/core/internal/traits.d index 089346e251..aa331590ac 100644 --- a/src/core/internal/traits.d +++ b/src/core/internal/traits.d @@ -657,21 +657,10 @@ template DynamicArrayTypeOf(T) static assert(0, T.stringof~" is not a dynamic array"); } +// TODO(stefanos): More unit-testing. 
+ @safe unittest { - static foreach (T; AliasSeq!(/*void, */bool, NumericTypeList, /*ImaginaryTypeList, ComplexTypeList*/)) - static foreach (Q; AliasSeq!(TypeQualifierList, InoutOf, SharedInoutOf)) - { - static assert(is( Q!T[] == DynamicArrayTypeOf!( Q!T[] ) )); - static assert(is( Q!(T[]) == DynamicArrayTypeOf!( Q!(T[]) ) )); - - static foreach (P; AliasSeq!(MutableOf, ConstOf, ImmutableOf)) - { - static assert(is( Q!(P!T[]) == DynamicArrayTypeOf!( Q!(SubTypeOf!(P!T[])) ) )); - static assert(is( Q!(P!(T[])) == DynamicArrayTypeOf!( Q!(SubTypeOf!(P!(T[]))) ) )); - } - } - static assert(!is(DynamicArrayTypeOf!(int[3]))); static assert(!is(DynamicArrayTypeOf!(void[3]))); static assert(!is(DynamicArrayTypeOf!(typeof(null)))); From 7b9eb3c9dcffef2956446bf47abd5661501e8657 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Fri, 5 Jul 2019 17:29:03 +0300 Subject: [PATCH 15/29] Fix for uint vs ubyte in memsetNaive --- src/core/experimental/memutils.d | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index ab83e4f2b9..4fe44cbd1f 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -149,12 +149,12 @@ else /* Naive implementation */ -private void memsetNaive(void *dst, const ubyte val, size_t n) +private void memsetNaive(void *dst, const uint val, size_t n) { ubyte *d = cast(ubyte*) dst; foreach (i; 0 .. 
n) { - d[i] = val; + d[i] = cast(ubyte)val; } } From 08d044ff7055bcfcad18c219abdbc93a11022dd7 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Sat, 6 Jul 2019 14:27:36 +0300 Subject: [PATCH 16/29] Removed escaping from tests in memutils --- test/experimental/src/memutils.d | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/test/experimental/src/memutils.d b/test/experimental/src/memutils.d index 8a30fc3217..f9ed626c67 100644 --- a/test/experimental/src/memutils.d +++ b/test/experimental/src/memutils.d @@ -42,20 +42,6 @@ void main() DmemsetTestStaticArray!(ubyte, 64349)(5); } -// From a very good Chandler Carruth video on benchmarking: https://www.youtube.com/watch?v=nXaxk27zwlk -void escape(void* p) -{ - version (LDC) - { - import ldc.llvmasm; - __asm("", "r,~{memory}", p); - } - version (GNU) - { - asm { "" : : "g" p : "memory"; } - } -} - void DmemsetVerifyArray(T)(int j, const ref T[] a, const ubyte v) { const ubyte *p = cast(const ubyte *) a.ptr; @@ -74,6 +60,10 @@ void DmemsetVerifyStaticType(T)(const ref T t, const ubyte v) } } +// NOTE(stefanos): Escaping the pointers is not needed, the compiler doesn't optimize it away. +// My best guess is that this is because of the verification (i.e. if the operation is not done, +// an assert will fire and does not satisfy correctness). 
+ void DmemsetTestDynamicArray(T)(const ubyte v, size_t n) { T[] buf; From d611a186485b6cebf04b8657aefaf82e3e772b85 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Sun, 7 Jul 2019 18:40:27 +0300 Subject: [PATCH 17/29] Versioning improvement --- src/core/experimental/memutils.d | 41 ++++++++++++++------------------ 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 4fe44cbd1f..00a5058d3f 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -34,18 +34,22 @@ void memset(T)(ref T dst, const ubyte val) } } -version (GNU) +version (D_SIMD) { - private void Dmemset(void *d, const uint val, size_t n) - { - memsetNaive(d, val, n); - } + version = useSIMD; } -else -version (D_SIMD) +version (LDC) +{ + // LDC always supports SIMD and the back-end uses the most + // appropriate size for every target. + version = useSIMD; +} + +version (useSIMD) { /* SIMD implementation */ + //pragma(msg, "SIMD used"); private void Dmemset(void *d, const uint val, size_t n) { import core.simd : int4; @@ -64,30 +68,23 @@ version (D_SIMD) // TODO(stefanos): Is there a way to make them @safe? // (The problem is that for LDC, they could take int* or float* pointers // but the cast to void16 for DMD is necessary anyway). 
- void store32i_sse(void *dest, int4 reg) + void store16i_sse(void *dest, int4 reg) { version (LDC) { storeUnaligned!int4(reg, cast(int*) dest); - storeUnaligned!int4(reg, cast(int*) (dest+0x10)); } else { storeUnaligned(cast(void16*) dest, reg); - storeUnaligned(cast(void16*) (dest+0x10), reg); } } - void store16i_sse(void *dest, int4 reg) + void store32i_sse(void *dest, int4 reg) { - version (LDC) - { - storeUnaligned!int4(reg, cast(int*) dest); - } - else - { - storeUnaligned(cast(void16*) dest, reg); - } + store16i_sse(dest, reg); + store16i_sse(dest+0x10, reg); } + const uint v = val * 0x01010101; // Broadcast c to all 4 bytes // NOTE(stefanos): I use the naive version, which in my benchmarks was slower // than the previous classic switch. BUT. Using the switch had a significant @@ -137,18 +134,17 @@ version (D_SIMD) // Compensate for the last (at most) 32 bytes. store32i_sse(temp-0x10, xmm0); } - } else { + /* Forward to simple implementation. + */ private void Dmemset(void *d, const uint val, size_t n) { memsetNaive(d, val, n); } } -/* Naive implementation - */ private void memsetNaive(void *dst, const uint val, size_t n) { ubyte *d = cast(ubyte*) dst; @@ -158,7 +154,6 @@ private void memsetNaive(void *dst, const uint val, size_t n) } } - /** Core features tests. 
*/ unittest From 00ca80a4ae50aa2d25dd6a86b6cbf0242140c29c Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Sun, 7 Jul 2019 21:29:09 +0300 Subject: [PATCH 18/29] GDC SIMD version and bug fix --- src/core/experimental/memutils.d | 86 +++++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 19 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 00a5058d3f..da1b6aa9e9 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -13,15 +13,16 @@ module core.experimental.memutils; */ /* - If T is an array,set all `dst`'s bytes + If T is an array, set all `dst`'s bytes (whose count is the length of the array times the size of the array element) to `val`. Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`. */ - -void memset(T)(ref T dst, const ubyte val) +// This is named Dmemset (contrary to the D runtime +// PR where it's named memset()) for clear disambiguation with the libc memset(). +void Dmemset(T)(ref T dst, const ubyte val) { - import core.internal.traits : isArray; + import std.traits : isArray; const uint v = cast(uint) val; static if (isArray!T) { @@ -38,10 +39,16 @@ version (D_SIMD) { version = useSIMD; } -version (LDC) +else version (LDC) +{ + // LDC always supports SIMD (but doesn't ever set D_SIMD) and + // the back-end uses the most appropriate size for every target. + version = useSIMD; +} +else version (GNU) { - // LDC always supports SIMD and the back-end uses the most - // appropriate size for every target. + // GNU does not support SIMD by default. We have to do more complicated + // stuff below. So we start by default with useSIMD and decide later. 
version = useSIMD; } @@ -50,34 +57,75 @@ version (useSIMD) /* SIMD implementation */ //pragma(msg, "SIMD used"); - private void Dmemset(void *d, const uint val, size_t n) + extern(C) private void Dmemset(void *d, const uint val, size_t n) { import core.simd : int4; version (LDC) { + enum gdcSIMD = false; import ldc.simd : loadUnaligned, storeUnaligned; } else version (DigitalMars) { import core.simd : void16, loadUnaligned, storeUnaligned; } - else + else version (GNU) { - static assert(0, "Only DMD / LDC are supported"); + // NOTE(stefanos): I could not combine GDC versioning in `useSIMD`. + // To know if we can use SIMD for GDC is more complex. We need to: + // - Be in x86 arch since the intrinsics (builtins) are only x86 specific. + // - Compile the int4 vector size. + // TODO(stefanos): The GCC specification points that to use the store intrinsic, + // we have to be in SSE2. Is this guaranteed if `int4` compiles? + // Note that GCC builtins provide the __builtin_cpu_supports() but this is a runtime + // function. + version (X86_64) + { + enum isX86 = true; + } + else version (X86) + { + enum isX86 = true; + } + + static if (isX86 && __traits(compiles, int4)) + { + enum gdcSIMD = true; + } + else + { + memsetNaive(d, val, n); + return; + } } + // TODO(stefanos): Is there a way to make them @safe? // (The problem is that for LDC, they could take int* or float* pointers // but the cast to void16 for DMD is necessary anyway). 
- void store16i_sse(void *dest, int4 reg) + + static if (gdcSIMD) { - version (LDC) + import gcc.builtins; + import core.simd : ubyte16; + void store16i_sse(void *dest, int4 reg) { - storeUnaligned!int4(reg, cast(int*) dest); + __builtin_ia32_storedqu(cast(char*) dest, cast(ubyte16) reg); } - else + } + else + { + void store16i_sse(void *dest, int4 reg) { - storeUnaligned(cast(void16*) dest, reg); + version (LDC) + { + storeUnaligned!int4(reg, cast(int*) dest); + } + else + { + storeUnaligned(cast(void16*) dest, reg); + } } + } void store32i_sse(void *dest, int4 reg) { @@ -85,17 +133,17 @@ version (useSIMD) store16i_sse(dest+0x10, reg); } - const uint v = val * 0x01010101; // Broadcast c to all 4 bytes // NOTE(stefanos): I use the naive version, which in my benchmarks was slower // than the previous classic switch. BUT. Using the switch had a significant // drop in the rest of the sizes. It's not the branch that is responsible for the drop, // but the fact that it's more difficult to optimize it as part of the rest of the code. - if (n <= 16) + if (n < 32) { memsetNaive(cast(ubyte*) d, cast(ubyte) val, n); return; } void *temp = d + n - 0x10; // Used for the last 32 bytes + const uint v = val * 0x01010101; // Broadcast c to all 4 bytes // Broadcast v to all bytes. auto xmm0 = int4(v); ubyte rem = cast(ubyte) d & 15; // Remainder from the previous 16-byte boundary. @@ -159,13 +207,13 @@ private void memsetNaive(void *dst, const uint val, size_t n) unittest { ubyte[3] a; - memset(a, 7); + Dmemset(a, 7); assert(a[0] == 7); assert(a[1] == 7); assert(a[2] == 7); real b; - memset(b, 9); + Dmemset(b, 9); ubyte *p = cast(ubyte*) &b; foreach (i; 0 .. 
b.sizeof) { From 9ad8f16ecb4a565a0bfb329de63aba7d50067d0e Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Sun, 7 Jul 2019 22:15:13 +0300 Subject: [PATCH 19/29] Not so naive version of memsetNaive --- src/core/experimental/memutils.d | 75 +++++++++++++++++++++++++++----- 1 file changed, 65 insertions(+), 10 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index da1b6aa9e9..d117dd25a4 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -18,11 +18,9 @@ module core.experimental.memutils; the size of the array element) to `val`. Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`. */ -// This is named Dmemset (contrary to the D runtime -// PR where it's named memset()) for clear disambiguation with the libc memset(). -void Dmemset(T)(ref T dst, const ubyte val) +void memset(T)(ref T dst, const ubyte val) { - import std.traits : isArray; + import core.internal.traits : isArray; const uint v = cast(uint) val; static if (isArray!T) { @@ -35,6 +33,7 @@ void Dmemset(T)(ref T dst, const ubyte val) } } + version (D_SIMD) { version = useSIMD; @@ -67,6 +66,7 @@ version (useSIMD) } else version (DigitalMars) { + enum gdcSIMD = false; import core.simd : void16, loadUnaligned, storeUnaligned; } else version (GNU) @@ -139,7 +139,7 @@ version (useSIMD) // but the fact that it's more difficult to optimize it as part of the rest of the code. if (n < 32) { - memsetNaive(cast(ubyte*) d, cast(ubyte) val, n); + memsetNaive(d, val, n); return; } void *temp = d + n - 0x10; // Used for the last 32 bytes @@ -193,13 +193,68 @@ else } } +// NOTE(stefanos): We're using naive for the < 32 case in the SIMD version. +// To be more performant, for that case, we would have a big fall-through switch +// for all < 32 sizes. private void memsetNaive(void *dst, const uint val, size_t n) { - ubyte *d = cast(ubyte*) dst; - foreach (i; 0 .. 
n) + const ulong v = cast(ulong) val * 0x0101010101010101; // Broadcast val to all 8 bytes + enum handleLT16Sizes = " + switch (n) + { + case 6: + *(cast(uint*) (dst+2)) = cast(uint) v; + goto case 2; // fall-through + case 2: + *(cast(ushort*) dst) = cast(ushort) v; + return; + + case 7: + *(cast(uint*) (dst+3)) = cast(uint) v; + goto case 3; // fall-through + case 3: + *(cast(ushort*) (dst+1)) = cast(ushort) v; + goto case 1; // fall-through + case 1: + *(cast(ubyte*) dst) = cast(ubyte) v; + return; + + case 4: + *(cast(uint*) dst) = cast(uint) v; + return; + case 0: + return; + + case 5: + *(cast(uint*) (dst+1)) = cast(uint) v; + *(cast(ubyte*) dst) = cast(ubyte) v; + return; + default: + } + "; + mixin(handleLT16Sizes); + // NOTE(stefanos): Normally, we would have different alignment + // for 32-bit and 64-bit versions. For the sake of simplicity, + // we'll let the compiler do the work. + ubyte rem = cast(ubyte) dst & 7; + if (rem) + { // Unaligned + // Move 8 bytes (which we will possibly overlap later). + *(cast(ulong*) dst) = v; + // Reach alignment + dst += 8 - rem; + n -= 8 - rem; + } + ulong *d = cast(ulong*) dst; + ulong temp = n / 8; + for (size_t i = 0; i != temp; ++i) { - d[i] = cast(ubyte)val; + *d = v; + ++d; // += 8 + n -= 8; } + dst = cast(void *) d; + mixin(handleLT16Sizes); } /** Core features tests. @@ -207,13 +262,13 @@ private void memsetNaive(void *dst, const uint val, size_t n) unittest { ubyte[3] a; - Dmemset(a, 7); + memset(a, 7); assert(a[0] == 7); assert(a[1] == 7); assert(a[2] == 7); real b; - Dmemset(b, 9); + memset(b, 9); ubyte *p = cast(ubyte*) &b; foreach (i; 0 .. 
b.sizeof) { From 504fc7bbe80dbff527e783d148adb1153839689e Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Mon, 29 Jul 2019 23:05:11 +0300 Subject: [PATCH 20/29] mixin removal in memsetNaive --- src/core/experimental/memutils.d | 81 ++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index d117dd25a4..17d5d66c02 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -33,7 +33,6 @@ void memset(T)(ref T dst, const ubyte val) } } - version (D_SIMD) { version = useSIMD; @@ -56,7 +55,7 @@ version (useSIMD) /* SIMD implementation */ //pragma(msg, "SIMD used"); - extern(C) private void Dmemset(void *d, const uint val, size_t n) + private void Dmemset(void *d, const uint val, size_t n) { import core.simd : int4; version (LDC) @@ -198,41 +197,50 @@ else // for all < 32 sizes. private void memsetNaive(void *dst, const uint val, size_t n) { - const ulong v = cast(ulong) val * 0x0101010101010101; // Broadcast val to all 8 bytes - enum handleLT16Sizes = " - switch (n) + // NOTE(stefanos): DMD could not inline it. 
+ void handleLT16Sizes(void *d, const ulong v, size_t n) { - case 6: - *(cast(uint*) (dst+2)) = cast(uint) v; - goto case 2; // fall-through - case 2: - *(cast(ushort*) dst) = cast(ushort) v; - return; + switch (n) + { + case 6: + *(cast(uint*) (d+2)) = cast(uint) v; + goto case 2; // fall-through + case 2: + *(cast(ushort*) d) = cast(ushort) v; + return; - case 7: - *(cast(uint*) (dst+3)) = cast(uint) v; - goto case 3; // fall-through - case 3: - *(cast(ushort*) (dst+1)) = cast(ushort) v; - goto case 1; // fall-through - case 1: - *(cast(ubyte*) dst) = cast(ubyte) v; - return; + case 7: + *(cast(uint*) (d+3)) = cast(uint) v; + goto case 3; // fall-through + case 3: + *(cast(ushort*) (d+1)) = cast(ushort) v; + goto case 1; // fall-through + case 1: + *(cast(ubyte*) d) = cast(ubyte) v; + return; - case 4: - *(cast(uint*) dst) = cast(uint) v; - return; - case 0: - return; + case 4: + *(cast(uint*) d) = cast(uint) v; + return; + case 0: + return; - case 5: - *(cast(uint*) (dst+1)) = cast(uint) v; - *(cast(ubyte*) dst) = cast(ubyte) v; - return; - default: + case 5: + *(cast(uint*) (d+1)) = cast(uint) v; + *(cast(ubyte*) d) = cast(ubyte) v; + return; + default: + } + } + + + const ulong v = cast(ulong) val * 0x0101010101010101; // Broadcast val to all 8 bytes + if (n < 8) + { + handleLT16Sizes(dst, v, n); + return; } - "; - mixin(handleLT16Sizes); + // NOTE(stefanos): Normally, we would have different alignment // for 32-bit and 64-bit versions. For the sake of simplicity, // we'll let the compiler do the work. @@ -241,22 +249,23 @@ private void memsetNaive(void *dst, const uint val, size_t n) { // Unaligned // Move 8 bytes (which we will possibly overlap later). 
*(cast(ulong*) dst) = v; - // Reach alignment dst += 8 - rem; n -= 8 - rem; } ulong *d = cast(ulong*) dst; ulong temp = n / 8; - for (size_t i = 0; i != temp; ++i) + for(size_t i = 0; i != temp; ++i) { *d = v; - ++d; // += 8 + ++d; n -= 8; } dst = cast(void *) d; - mixin(handleLT16Sizes); + + handleLT16Sizes(dst, v, n); } + /** Core features tests. */ unittest From 9af240fb7821a51259480008e75f11b2258b55c1 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Mon, 29 Jul 2019 23:07:50 +0300 Subject: [PATCH 21/29] Style fix --- src/core/experimental/memutils.d | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 17d5d66c02..3f49e6ffb4 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -240,7 +240,6 @@ private void memsetNaive(void *dst, const uint val, size_t n) handleLT16Sizes(dst, v, n); return; } - // NOTE(stefanos): Normally, we would have different alignment // for 32-bit and 64-bit versions. For the sake of simplicity, // we'll let the compiler do the work. @@ -254,7 +253,7 @@ private void memsetNaive(void *dst, const uint val, size_t n) } ulong *d = cast(ulong*) dst; ulong temp = n / 8; - for(size_t i = 0; i != temp; ++i) + for (size_t i = 0; i != temp; ++i) { *d = v; ++d; From 08ffa2c56c1aa1f90d533973e645be8106dd2823 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Tue, 30 Jul 2019 12:57:47 +0300 Subject: [PATCH 22/29] Doc fix --- src/core/experimental/memutils.d | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 3f49e6ffb4..17267a93b9 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -4,14 +4,6 @@ */ module core.experimental.memutils; -/** memset() implementation */ - -/** - * NOTE(stefanos): - * Range-checking is not needed since the user never - * pass an `n` (byte count) directly. 
- */ - /* If T is an array, set all `dst`'s bytes (whose count is the length of the array times @@ -54,7 +46,6 @@ version (useSIMD) { /* SIMD implementation */ - //pragma(msg, "SIMD used"); private void Dmemset(void *d, const uint val, size_t n) { import core.simd : int4; @@ -154,7 +145,6 @@ version (useSIMD) d += 16 - rem; n -= 16 - rem; // Move in blocks of 32. - // TODO(stefanos): Experiment with differnt sizes. if (n >= 32) { // Align to (previous) multiple of 32. That does something invisible to the code, @@ -192,9 +182,9 @@ else } } -// NOTE(stefanos): We're using naive for the < 32 case in the SIMD version. -// To be more performant, for that case, we would have a big fall-through switch -// for all < 32 sizes. +/* + Naive version for when there isn't any vector support (SIMD etc.). +*/ private void memsetNaive(void *dst, const uint val, size_t n) { // NOTE(stefanos): DMD could not inline it. @@ -253,6 +243,7 @@ private void memsetNaive(void *dst, const uint val, size_t n) } ulong *d = cast(ulong*) dst; ulong temp = n / 8; + // Go in steps of 8 - the register size in x86_64. for (size_t i = 0; i != temp; ++i) { *d = v; From ff8121929f14c4d875276b989e70152bf8009f78 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Tue, 30 Jul 2019 13:04:29 +0300 Subject: [PATCH 23/29] SIMD versioning improvement --- src/core/experimental/memutils.d | 85 +++++++++++--------------------- 1 file changed, 30 insertions(+), 55 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 17267a93b9..64e185074b 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -27,19 +27,37 @@ void memset(T)(ref T dst, const ubyte val) version (D_SIMD) { - version = useSIMD; + import core.simd : float4; + enum useSIMD = true; } else version (LDC) { // LDC always supports SIMD (but doesn't ever set D_SIMD) and // the back-end uses the most appropriate size for every target. 
- version = useSIMD; + import core.simd : float4; + enum useSIMD = true; } else version (GNU) { - // GNU does not support SIMD by default. We have to do more complicated - // stuff below. So we start by default with useSIMD and decide later. - version = useSIMD; + import core.simd : float4; + // GNU does not support SIMD by default. + version (X86_64) + { + private enum isX86 = true; + } + else version (X86) + { + private enum isX86 = true; + } + + static if (isX86 && __traits(compiles, int4)) + { + enum useSIMD = true; + } + else + { + enum useSIMD = false; + } } version (useSIMD) @@ -51,49 +69,21 @@ version (useSIMD) import core.simd : int4; version (LDC) { - enum gdcSIMD = false; import ldc.simd : loadUnaligned, storeUnaligned; + void store16i_sse(void *dest, int4 reg) + { + storeUnaligned!int4(reg, cast(int*) dest); + } } else version (DigitalMars) { - enum gdcSIMD = false; import core.simd : void16, loadUnaligned, storeUnaligned; - } - else version (GNU) - { - // NOTE(stefanos): I could not combine GDC versioning in `useSIMD`. - // To know if we can use SIMD for GDC is more complex. We need to: - // - Be in x86 arch since the intrinsics (builtins) are only x86 specific. - // - Compile the int4 vector size. - // TODO(stefanos): The GCC specification points that to use the store intrinsic, - // we have to be in SSE2. Is this guaranteed if `int4` compiles? - // Note that GCC builtins provide the __builtin_cpu_supports() but this is a runtime - // function. - version (X86_64) - { - enum isX86 = true; - } - else version (X86) - { - enum isX86 = true; - } - - static if (isX86 && __traits(compiles, int4)) - { - enum gdcSIMD = true; - } - else + void store16i_sse(void *dest, int4 reg) { - memsetNaive(d, val, n); - return; + storeUnaligned(cast(void16*) dest, reg); } } - - // TODO(stefanos): Is there a way to make them @safe? - // (The problem is that for LDC, they could take int* or float* pointers - // but the cast to void16 for DMD is necessary anyway). 
- - static if (gdcSIMD) + else { import gcc.builtins; import core.simd : ubyte16; @@ -102,21 +92,6 @@ version (useSIMD) __builtin_ia32_storedqu(cast(char*) dest, cast(ubyte16) reg); } } - else - { - void store16i_sse(void *dest, int4 reg) - { - version (LDC) - { - storeUnaligned!int4(reg, cast(int*) dest); - } - else - { - storeUnaligned(cast(void16*) dest, reg); - } - } - - } void store32i_sse(void *dest, int4 reg) { store16i_sse(dest, reg); From 4e6654b6d7c05407d39b986f6a536bfd774eb645 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Tue, 30 Jul 2019 17:32:23 +0300 Subject: [PATCH 24/29] Doc improvement --- src/core/experimental/memutils.d | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 64e185074b..11df0f22ff 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -4,12 +4,12 @@ */ module core.experimental.memutils; -/* - If T is an array, set all `dst`'s bytes - (whose count is the length of the array times - the size of the array element) to `val`. - Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`. -*/ +/** + * If T is an array, set all `dst`'s bytes + * (whose count is the length of the array times + * the size of the array element) to `val`. + * Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`. + */ void memset(T)(ref T dst, const ubyte val) { import core.internal.traits : isArray; @@ -97,7 +97,6 @@ version (useSIMD) store16i_sse(dest, reg); store16i_sse(dest+0x10, reg); } - // NOTE(stefanos): I use the naive version, which in my benchmarks was slower // than the previous classic switch. BUT. Using the switch had a significant // drop in the rest of the sizes. It's not the branch that is responsible for the drop, @@ -157,8 +156,7 @@ else } } -/* - Naive version for when there isn't any vector support (SIMD etc.). 
+/* Naive version for when there isn't any vector support (SIMD etc.). */ private void memsetNaive(void *dst, const uint val, size_t n) { From d7b8a0b83f9e0777b650e8dbc744db311297a41a Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Tue, 30 Jul 2019 17:37:33 +0300 Subject: [PATCH 25/29] Doc improvement 2 --- src/core/experimental/memutils.d | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 11df0f22ff..76638f501f 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -9,6 +9,13 @@ module core.experimental.memutils; * (whose count is the length of the array times * the size of the array element) to `val`. * Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`. + * + * Params + * val = The byte with which we want to fill memory with. + * dst = Memory Destination whose bytes are to be set to `val`. + * + * Returns: + * Nothing. */ void memset(T)(ref T dst, const ubyte val) { From 83541f783aee82fceacc4dfe4da6be04585d2cc2 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Tue, 30 Jul 2019 17:47:30 +0300 Subject: [PATCH 26/29] Minor changes --- src/core/experimental/memutils.d | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 76638f501f..5e52ac25b9 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -9,8 +9,8 @@ module core.experimental.memutils; * (whose count is the length of the array times * the size of the array element) to `val`. * Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`. - * - * Params + * + * Params: * val = The byte with which we want to fill memory with. * dst = Memory Destination whose bytes are to be set to `val`. 
* From b9bc30c652eb2f52912e6c7800b6be2195f25316 Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Tue, 30 Jul 2019 17:52:55 +0300 Subject: [PATCH 27/29] Changed Returns to N.B. --- src/core/experimental/memutils.d | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 5e52ac25b9..51bbc1662d 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -9,13 +9,11 @@ module core.experimental.memutils; * (whose count is the length of the array times * the size of the array element) to `val`. * Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`. + * N.B.: Contrary to the C Standard Library memset(), this function returns nothing. * * Params: * val = The byte with which we want to fill memory with. * dst = Memory Destination whose bytes are to be set to `val`. - * - * Returns: - * Nothing. */ void memset(T)(ref T dst, const ubyte val) { From 1204a8b9d3caafca755902cac69836a56df07bbc Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Sat, 3 Aug 2019 15:18:45 +0300 Subject: [PATCH 28/29] Add test for empty array --- src/core/experimental/memutils.d | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 51bbc1662d..44ef7e3431 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -251,4 +251,8 @@ unittest { assert(p[i] == 9); } + + // Verify that it does not crash on empty array. 
+ ubyte[0] c; + memset(c, 9); } From a4c7a8d5486e01fb1571db46fbf0aa3c82c32efd Mon Sep 17 00:00:00 2001 From: Stefanos Baziotis Date: Sat, 3 Aug 2019 15:22:52 +0300 Subject: [PATCH 29/29] Added @nogc nothrow --- src/core/experimental/memutils.d | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d index 44ef7e3431..6b5735b9f1 100644 --- a/src/core/experimental/memutils.d +++ b/src/core/experimental/memutils.d @@ -15,7 +15,7 @@ module core.experimental.memutils; * val = The byte with which we want to fill memory with. * dst = Memory Destination whose bytes are to be set to `val`. */ -void memset(T)(ref T dst, const ubyte val) +void memset(T)(ref T dst, const ubyte val) nothrow @nogc { import core.internal.traits : isArray; const uint v = cast(uint) val; @@ -69,13 +69,13 @@ version (useSIMD) { /* SIMD implementation */ - private void Dmemset(void *d, const uint val, size_t n) + private void Dmemset(void *d, const uint val, size_t n) nothrow @nogc { import core.simd : int4; version (LDC) { import ldc.simd : loadUnaligned, storeUnaligned; - void store16i_sse(void *dest, int4 reg) + void store16i_sse(void *dest, int4 reg) nothrow @nogc { storeUnaligned!int4(reg, cast(int*) dest); } @@ -83,7 +83,7 @@ version (useSIMD) else version (DigitalMars) { import core.simd : void16, loadUnaligned, storeUnaligned; - void store16i_sse(void *dest, int4 reg) + void store16i_sse(void *dest, int4 reg) nothrow @nogc { storeUnaligned(cast(void16*) dest, reg); } @@ -92,12 +92,12 @@ version (useSIMD) { import gcc.builtins; import core.simd : ubyte16; - void store16i_sse(void *dest, int4 reg) + void store16i_sse(void *dest, int4 reg) nothrow @nogc { __builtin_ia32_storedqu(cast(char*) dest, cast(ubyte16) reg); } } - void store32i_sse(void *dest, int4 reg) + void store32i_sse(void *dest, int4 reg) nothrow @nogc { store16i_sse(dest, reg); store16i_sse(dest+0x10, reg); @@ -155,7 +155,7 @@ else { /* 
Forward to simple implementation. */ - private void Dmemset(void *d, const uint val, size_t n) + private void Dmemset(void *d, const uint val, size_t n) nothrow @nogc { memsetNaive(d, val, n); } @@ -163,7 +163,7 @@ else /* Naive version for when there isn't any vector support (SIMD etc.). */ -private void memsetNaive(void *dst, const uint val, size_t n) +private void memsetNaive(void *dst, const uint val, size_t n) nothrow @nogc { // NOTE(stefanos): DMD could not inline it. void handleLT16Sizes(void *d, const ulong v, size_t n)