From 847140e0f8e7a150a5178bb8fd5ba443f025aa7e Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Thu, 4 Jul 2019 22:40:05 +0300
Subject: [PATCH 01/29] memutils: Replacement of libc string.h functions -
 currently only Dmemset()

---
 mak/COPY                         |   2 +
 mak/DOCS                         |   2 +
 mak/SRCS                         |   2 +
 mak/WINDOWS                      |   3 +
 src/core/experimental/memutils.d | 300 +++++++++++++++++++++++++++++++
 5 files changed, 309 insertions(+)
 create mode 100644 src/core/experimental/memutils.d

diff --git a/mak/COPY b/mak/COPY
index 4c1719a041..6086012578 100644
--- a/mak/COPY
+++ b/mak/COPY
@@ -21,6 +21,8 @@ COPY=\
 	$(IMPDIR)\core\time.d \
 	$(IMPDIR)\core\vararg.d \
 	\
+	$(IMPDIR)\core\experimental\memutils.d \
+    \
 	$(IMPDIR)\core\internal\abort.d \
 	$(IMPDIR)\core\internal\arrayop.d \
 	$(IMPDIR)\core\internal\convert.d \
diff --git a/mak/DOCS b/mak/DOCS
index fa49be8963..c5ea44bcc2 100644
--- a/mak/DOCS
+++ b/mak/DOCS
@@ -19,6 +19,8 @@ DOCS=\
 	$(DOCDIR)\core_gc_config.html \
 	$(DOCDIR)\core_gc_gcinterface.html \
 	$(DOCDIR)\core_gc_registry.html \
+    \
+	$(DOCDIR)\core_experimental_memutils.html \
 	\
 	$(DOCDIR)\core_stdc_assert_.html \
 	$(DOCDIR)\core_stdc_config.html \
diff --git a/mak/SRCS b/mak/SRCS
index 309ca0f8d4..9d9d897cb0 100644
--- a/mak/SRCS
+++ b/mak/SRCS
@@ -16,6 +16,8 @@ SRCS=\
 	src\core\thread.d \
 	src\core\time.d \
 	src\core\vararg.d \
+    \
+	src\core\experimental\memutils.d \
 	\
 	src\core\gc\config.d \
 	src\core\gc\gcinterface.d \
diff --git a/mak/WINDOWS b/mak/WINDOWS
index 8fc6f78e14..2d46889566 100644
--- a/mak/WINDOWS
+++ b/mak/WINDOWS
@@ -116,6 +116,9 @@ $(IMPDIR)\core\gc\gcinterface.d : src\core\gc\gcinterface.d
 $(IMPDIR)\core\gc\registry.d : src\core\gc\registry.d
 	copy $** $@
 
+$(IMPDIR)\core\experimental\memutils.d : src\core\experimental\memutils.d
+	copy $** $@
+
 $(IMPDIR)\core\internal\abort.d : src\core\internal\abort.d
 	copy $** $@
 
diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
new file mode 100644
index 0000000000..df41547aeb
--- /dev/null
+++ b/src/core/experimental/memutils.d
@@ -0,0 +1,300 @@
+/**
+ * Pure D replacement of the C Standard Library basic memory building blocks of string.h
+ *
+ * Source: $(DRUNTIMESRC core/experimental/memutils.d)
+ */
+
+module core.experimental.memutils;
+
+unittest
+{
+    Dmemset_testStaticType!(byte)(5);
+    Dmemset_testStaticType!(ubyte)(5);
+    Dmemset_testStaticType!(short)(5);
+    Dmemset_testStaticType!(ushort)(5);
+    Dmemset_testStaticType!(int)(5);
+    Dmemset_testStaticType!(uint)(5);
+    Dmemset_testStaticType!(long)(5);
+    Dmemset_testStaticType!(ulong)(5);
+    Dmemset_testStaticType!(float)(5);
+    Dmemset_testStaticType!(double)(5);
+    Dmemset_testStaticType!(real)(5);
+    Dmemset_testDynamicArray!(ubyte)(5, 3);
+    static foreach(i; 1..10) {
+        Dmemset_testDynamicArray!(ubyte)(5, 2^^i);
+        Dmemset_testStaticArray!(ubyte, 2^^i)(5);
+    }
+    Dmemset_testDynamicArray!(ubyte)(5, 100);
+    Dmemset_testStaticArray!(ubyte, 100)(5);
+    Dmemset_testDynamicArray!(ubyte)(5, 500);
+    Dmemset_testStaticArray!(ubyte, 500)(5);
+    Dmemset_testDynamicArray!(ubyte)(5, 700);
+    Dmemset_testStaticArray!(ubyte, 700)(5);
+    Dmemset_testDynamicArray!(ubyte)(5, 3434);
+    Dmemset_testStaticArray!(ubyte, 3434)(5);
+    Dmemset_testDynamicArray!(ubyte)(5, 7128);
+    Dmemset_testStaticArray!(ubyte, 7128)(5);
+    Dmemset_testDynamicArray!(ubyte)(5, 13908);
+    Dmemset_testStaticArray!(ubyte, 13908)(5);
+    Dmemset_testDynamicArray!(ubyte)(5, 16343);
+    Dmemset_testStaticArray!(ubyte, 16343)(5);
+    Dmemset_testDynamicArray!(ubyte)(5, 27897);
+    Dmemset_testStaticArray!(ubyte, 27897)(5);
+    Dmemset_testDynamicArray!(ubyte)(5, 32344);
+    Dmemset_testStaticArray!(ubyte, 32344)(5);
+    Dmemset_testDynamicArray!(ubyte)(5, 46830);
+    Dmemset_testStaticArray!(ubyte, 46830)(5);
+    Dmemset_testDynamicArray!(ubyte)(5, 64349);
+    Dmemset_testStaticArray!(ubyte, 64349)(5);
+}
+
+// From a very good Chandler Carruth video on benchmarking: https://www.youtube.com/watch?v=nXaxk27zwlk
+void escape(void* p)
+{
+    version(LDC)
+    {
+        import ldc.llvmasm;
+        __asm("", "r,~{memory}", p);
+    }
+    version(GNU)
+    {
+        asm { "" : : "g" p : "memory"; }
+    }
+}
+
+void Dmemset_verifyArray(T)(int j, const ref T[] a, const ubyte v)
+{
+    const ubyte *p = cast(const ubyte *) a.ptr;
+    for(size_t i = 0; i < a.length * T.sizeof; i++)
+    {
+        assert(p[i] == v);
+    }
+}
+
+void Dmemset_verifyStaticType(T)(const ref T t, const ubyte v)
+{
+    const ubyte *p = cast(const ubyte *) &t;
+    for(size_t i = 0; i < T.sizeof; i++)
+    {
+        assert(p[i] == v);
+    }
+}
+
+void Dmemset_testDynamicArray(T)(const ubyte v, size_t n)
+{
+    T[] buf;
+    buf.length = n + 32;
+
+    enum alignments = 32;
+    size_t len = n;
+
+    foreach(i; 0..alignments)
+    {
+        auto d = buf[i..i+n];
+
+        escape(d.ptr);
+        Dmemset(d, v);
+        Dmemset_verifyArray(i, d, v);
+    }
+}
+
+void Dmemset_testStaticArray(T, size_t n)(const ubyte v)
+{
+    T[n + 32] buf;
+
+    enum alignments = 32;
+    size_t len = n;
+
+    foreach(i; 0..alignments)
+    {
+        auto d = buf[i..i+n];
+
+        escape(d.ptr);
+        Dmemset(d, v);
+        Dmemset_verifyArray(i, d, v);
+    }
+}
+
+void Dmemset_testStaticType(T)(const ubyte v)
+{
+    T t;
+    escape(&t);
+    Dmemset(t, v);
+    Dmemset_verifyStaticType(t, v);
+}
+
+version (GNU)
+{
+    void Dmemset(void *d, const uint val, size_t n)
+    {
+        Dmemset_naive(d, cast(const(ubyte))val, n);
+    }
+}
+else
+{
+    // NOTE(stefanos): I could not a GDC respective of the intrinsics.
+    void Dmemset(void *d, const uint val, size_t n)
+    {
+        import core.simd: int4;
+        version (LDC)
+        {
+            import ldc.simd: loadUnaligned, storeUnaligned;
+        }
+        else
+        version (DigitalMars)
+        {
+            import core.simd: void16, loadUnaligned, storeUnaligned;
+        }
+        else
+        {
+            static assert(0, "Only DMD / LDC are supported");
+        }
+        
+        // TODO(stefanos): Is there a way to make them @safe?
+        // (The problem is that for LDC, they could take int* or float* pointers
+        // but the cast to void16 for DMD is necessary anyway).
+
+        /// Integer ///
+
+        void store32i_sse(void *dest, int4 reg)
+        {
+            version (LDC)
+            {
+                storeUnaligned!int4(reg, cast(int*)dest);
+                storeUnaligned!int4(reg, cast(int*)(dest+0x10));
+            }
+            else
+            {
+                storeUnaligned(cast(void16*)dest, reg);
+                storeUnaligned(cast(void16*)(dest+0x10), reg);
+            }
+        }
+        
+        void store16i_sse(void *dest, int4 reg)
+        {
+            version (LDC)
+            {
+                storeUnaligned!int4(reg, cast(int*)dest);
+            }
+            else
+            {
+                storeUnaligned(cast(void16*)dest, reg);
+            }
+        }
+        
+        // TODO(stefanos): Can we broadcast an int in a float4? That would be useful
+        // because then we would use only the float versions.
+        void broadcast_int(ref int4 xmm, int v)
+        {
+            xmm[0] = v;
+            xmm[1] = v;
+            xmm[2] = v;
+            xmm[3] = v;
+        }
+        const uint v = val * 0x01010101;            // Broadcast c to all 4 bytes
+    
+        // NOTE(stefanos): I use the naive version, which in my benchmarks was slower
+        // than the previous classic switch. BUT. Using the switch had a significant
+        // drop in the rest of the sizes. It's not the branch that is responsible for the drop,
+        // but the fact that it's more difficult to optimize it as part of the rest of the code.
+        if (n <= 16)
+        {
+            Dmemset_naive(cast(ubyte*)d, cast(ubyte)val, n);
+            return;
+        }
+        void *temp = d + n - 0x10;                  // Used for the last 32 bytes
+    
+        int4 xmm0;
+        // Broadcast v to all bytes.
+        broadcast_int(xmm0, v);
+    
+        ubyte rem = cast(ulong)d & 15;              // Remainder from the previous 16-byte boundary.
+        // Store 16 bytes, from which some will possibly overlap on a future store.
+        // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned,
+        // add 16 - 7 = 9 to `d` and start storing aligned. Since 16 - `rem` can be at most
+        // 16, we store 16 bytes anyway.
+        store16i_sse(d, xmm0);
+        d += 16 - rem;
+        n -= 16 - rem;
+    
+        // Move in blocks of 32.
+        // TODO(stefanos): Experiment with differnt sizes.
+        if (n >= 32)
+        {
+            // Align to (previous) multiple of 32. That does something invisible to the code,
+            // but a good optimizer will avoid a `cmp` instruction inside the loop. With a
+            // multiple of 32, the end of the loop can be (if we assume that `n` is in RDX):
+            // sub RDX, 32;
+            // jge START_OF_THE_LOOP.
+            // Without that, it has to be:
+            // sub RDX, 32;
+            // cmp RDX, 32;
+            // jge START_OF_THE_LOOP
+            // NOTE, that we align on a _previous_ multiple (for 37, we will go to 32). That means
+            // we have somehow to compensate for that, which is done at the end of this function.
+            n &= -32;
+            do
+            {
+                store32i_sse(d, xmm0);
+                // NOTE(stefanos): I tried avoiding this operation on `d` by combining
+                // `d` and `n` in the above loop and going backwards. It was slower in my benchs.
+                d += 32;
+                n -= 32;
+            } while(n >= 32);
+        }
+        // Compensate for the last (at most) 32 bytes.
+        store32i_sse(temp-0x10, xmm0);
+    }
+}
+
+void Dmemset_naive(void *dst, const ubyte val, size_t n)
+{
+    ubyte *d = cast(ubyte*)dst;
+    for (size_t i = 0; i != n; ++i)
+    {
+        d[i] = val;
+    }
+}
+
+// NOTE(stefanos):
+// Range-checking is not needed since the user never
+// pass an `n` (byte count) directly.
+
+void Dmemset(T)(ref T dst, const ubyte val)
+{
+    import std.traits;
+    const uint v = cast(uint)val;
+    version (X86_64)
+    {
+        static if (isArray!T)
+        {
+            size_t n = dst.length * typeof(dst[0]).sizeof;
+            Dmemset(dst.ptr, v, n);
+
+            version (unittest)
+            {
+                Dmemset_naive(dst.ptr, v, n);
+            }
+        }
+        else
+        {
+            Dmemset(&dst, v, T.sizeof);
+
+            version (unittest)
+            {
+                Dmemset_naive(&dst, v, T.sizeof);
+            }
+        }
+    }
+    else
+    {
+        static if (isArray!T)
+        {
+            Dmemset_naive(dst.ptr, val, dst.length * typeof(dst[0]).sizeof);
+        }
+        else
+        {
+            Dmemset_naive(&dst, val, T.sizeof);
+        }
+    }
+}

From f991173bef05c7fdd6629798f46ab933eaac0a8d Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Thu, 4 Jul 2019 22:57:54 +0300
Subject: [PATCH 02/29] Style fix

---
 src/core/experimental/memutils.d | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index df41547aeb..758ebc25c4 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -1,9 +1,7 @@
 /**
  * Pure D replacement of the C Standard Library basic memory building blocks of string.h
- *
  * Source: $(DRUNTIMESRC core/experimental/memutils.d)
  */
-
 module core.experimental.memutils;
 
 unittest
@@ -20,7 +18,7 @@ unittest
     Dmemset_testStaticType!(double)(5);
     Dmemset_testStaticType!(real)(5);
     Dmemset_testDynamicArray!(ubyte)(5, 3);
-    static foreach(i; 1..10) {
+    static foreach (i; 1..10) {
         Dmemset_testDynamicArray!(ubyte)(5, 2^^i);
         Dmemset_testStaticArray!(ubyte, 2^^i)(5);
     }
@@ -51,12 +49,12 @@ unittest
 // From a very good Chandler Carruth video on benchmarking: https://www.youtube.com/watch?v=nXaxk27zwlk
 void escape(void* p)
 {
-    version(LDC)
+    version (LDC)
     {
         import ldc.llvmasm;
         __asm("", "r,~{memory}", p);
     }
-    version(GNU)
+    version (GNU)
     {
         asm { "" : : "g" p : "memory"; }
     }
@@ -65,7 +63,7 @@ void escape(void* p)
 void Dmemset_verifyArray(T)(int j, const ref T[] a, const ubyte v)
 {
     const ubyte *p = cast(const ubyte *) a.ptr;
-    for(size_t i = 0; i < a.length * T.sizeof; i++)
+    for (size_t i = 0; i < a.length * T.sizeof; i++)
     {
         assert(p[i] == v);
     }
@@ -74,7 +72,7 @@ void Dmemset_verifyArray(T)(int j, const ref T[] a, const ubyte v)
 void Dmemset_verifyStaticType(T)(const ref T t, const ubyte v)
 {
     const ubyte *p = cast(const ubyte *) &t;
-    for(size_t i = 0; i < T.sizeof; i++)
+    for (size_t i = 0; i < T.sizeof; i++)
     {
         assert(p[i] == v);
     }
@@ -88,7 +86,7 @@ void Dmemset_testDynamicArray(T)(const ubyte v, size_t n)
     enum alignments = 32;
     size_t len = n;
 
-    foreach(i; 0..alignments)
+    foreach (i; 0..alignments)
     {
         auto d = buf[i..i+n];
 
@@ -105,7 +103,7 @@ void Dmemset_testStaticArray(T, size_t n)(const ubyte v)
     enum alignments = 32;
     size_t len = n;
 
-    foreach(i; 0..alignments)
+    foreach (i; 0..alignments)
     {
         auto d = buf[i..i+n];
 
@@ -149,13 +147,9 @@ else
         {
             static assert(0, "Only DMD / LDC are supported");
         }
-        
         // TODO(stefanos): Is there a way to make them @safe?
         // (The problem is that for LDC, they could take int* or float* pointers
         // but the cast to void16 for DMD is necessary anyway).
-
-        /// Integer ///
-
         void store32i_sse(void *dest, int4 reg)
         {
             version (LDC)
@@ -169,7 +163,6 @@ else
                 storeUnaligned(cast(void16*)(dest+0x10), reg);
             }
         }
-        
         void store16i_sse(void *dest, int4 reg)
         {
             version (LDC)
@@ -181,7 +174,6 @@ else
                 storeUnaligned(cast(void16*)dest, reg);
             }
         }
-        
         // TODO(stefanos): Can we broadcast an int in a float4? That would be useful
         // because then we would use only the float versions.
         void broadcast_int(ref int4 xmm, int v)
@@ -192,7 +184,6 @@ else
             xmm[3] = v;
         }
         const uint v = val * 0x01010101;            // Broadcast c to all 4 bytes
-    
         // NOTE(stefanos): I use the naive version, which in my benchmarks was slower
         // than the previous classic switch. BUT. Using the switch had a significant
         // drop in the rest of the sizes. It's not the branch that is responsible for the drop,
@@ -203,11 +194,9 @@ else
             return;
         }
         void *temp = d + n - 0x10;                  // Used for the last 32 bytes
-    
         int4 xmm0;
         // Broadcast v to all bytes.
         broadcast_int(xmm0, v);
-    
         ubyte rem = cast(ulong)d & 15;              // Remainder from the previous 16-byte boundary.
         // Store 16 bytes, from which some will possibly overlap on a future store.
         // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned,
@@ -216,7 +205,6 @@ else
         store16i_sse(d, xmm0);
         d += 16 - rem;
         n -= 16 - rem;
-    
         // Move in blocks of 32.
         // TODO(stefanos): Experiment with differnt sizes.
         if (n >= 32)
@@ -240,7 +228,7 @@ else
                 // `d` and `n` in the above loop and going backwards. It was slower in my benchs.
                 d += 32;
                 n -= 32;
-            } while(n >= 32);
+            } while (n >= 32);
         }
         // Compensate for the last (at most) 32 bytes.
         store32i_sse(temp-0x10, xmm0);

From ea2ce59af42f31570d958c5beb33576c9f39b5cd Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Thu, 4 Jul 2019 23:06:49 +0300
Subject: [PATCH 03/29] Versioning fix

---
 src/core/experimental/memutils.d | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 758ebc25c4..31259ad07c 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -252,7 +252,7 @@ void Dmemset(T)(ref T dst, const ubyte val)
 {
     import std.traits;
     const uint v = cast(uint)val;
-    version (X86_64)
+    version (D_SIMD)
     {
         static if (isArray!T)
         {

From bac120f468f201103e51b89346515f59fbbe114f Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Thu, 4 Jul 2019 23:17:28 +0300
Subject: [PATCH 04/29] Versioning fix vol. 2

---
 src/core/experimental/memutils.d | 179 ++++++++++++++++---------------
 1 file changed, 91 insertions(+), 88 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 31259ad07c..4a5fda6fba 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -130,108 +130,111 @@ version (GNU)
 }
 else
 {
-    // NOTE(stefanos): I could not a GDC respective of the intrinsics.
-    void Dmemset(void *d, const uint val, size_t n)
+    version (D_SIMD)
     {
-        import core.simd: int4;
-        version (LDC)
-        {
-            import ldc.simd: loadUnaligned, storeUnaligned;
-        }
-        else
-        version (DigitalMars)
-        {
-            import core.simd: void16, loadUnaligned, storeUnaligned;
-        }
-        else
-        {
-            static assert(0, "Only DMD / LDC are supported");
-        }
-        // TODO(stefanos): Is there a way to make them @safe?
-        // (The problem is that for LDC, they could take int* or float* pointers
-        // but the cast to void16 for DMD is necessary anyway).
-        void store32i_sse(void *dest, int4 reg)
+        // NOTE(stefanos): I could not find the respective GDC intrinsics.
+        void Dmemset(void *d, const uint val, size_t n)
         {
+            import core.simd: int4;
             version (LDC)
             {
-                storeUnaligned!int4(reg, cast(int*)dest);
-                storeUnaligned!int4(reg, cast(int*)(dest+0x10));
+                import ldc.simd: loadUnaligned, storeUnaligned;
             }
             else
+            version (DigitalMars)
             {
-                storeUnaligned(cast(void16*)dest, reg);
-                storeUnaligned(cast(void16*)(dest+0x10), reg);
+                import core.simd: void16, loadUnaligned, storeUnaligned;
             }
-        }
-        void store16i_sse(void *dest, int4 reg)
-        {
-            version (LDC)
+            else
             {
-                storeUnaligned!int4(reg, cast(int*)dest);
+                static assert(0, "Only DMD / LDC are supported");
             }
-            else
+            // TODO(stefanos): Is there a way to make them @safe?
+            // (The problem is that for LDC, they could take int* or float* pointers
+            // but the cast to void16 for DMD is necessary anyway).
+            void store32i_sse(void *dest, int4 reg)
             {
-                storeUnaligned(cast(void16*)dest, reg);
+                version (LDC)
+                {
+                    storeUnaligned!int4(reg, cast(int*)dest);
+                    storeUnaligned!int4(reg, cast(int*)(dest+0x10));
+                }
+                else
+                {
+                    storeUnaligned(cast(void16*)dest, reg);
+                    storeUnaligned(cast(void16*)(dest+0x10), reg);
+                }
             }
-        }
-        // TODO(stefanos): Can we broadcast an int in a float4? That would be useful
-        // because then we would use only the float versions.
-        void broadcast_int(ref int4 xmm, int v)
-        {
-            xmm[0] = v;
-            xmm[1] = v;
-            xmm[2] = v;
-            xmm[3] = v;
-        }
-        const uint v = val * 0x01010101;            // Broadcast c to all 4 bytes
-        // NOTE(stefanos): I use the naive version, which in my benchmarks was slower
-        // than the previous classic switch. BUT. Using the switch had a significant
-        // drop in the rest of the sizes. It's not the branch that is responsible for the drop,
-        // but the fact that it's more difficult to optimize it as part of the rest of the code.
-        if (n <= 16)
-        {
-            Dmemset_naive(cast(ubyte*)d, cast(ubyte)val, n);
-            return;
-        }
-        void *temp = d + n - 0x10;                  // Used for the last 32 bytes
-        int4 xmm0;
-        // Broadcast v to all bytes.
-        broadcast_int(xmm0, v);
-        ubyte rem = cast(ulong)d & 15;              // Remainder from the previous 16-byte boundary.
-        // Store 16 bytes, from which some will possibly overlap on a future store.
-        // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned,
-        // add 16 - 7 = 9 to `d` and start storing aligned. Since 16 - `rem` can be at most
-        // 16, we store 16 bytes anyway.
-        store16i_sse(d, xmm0);
-        d += 16 - rem;
-        n -= 16 - rem;
-        // Move in blocks of 32.
-        // TODO(stefanos): Experiment with differnt sizes.
-        if (n >= 32)
-        {
-            // Align to (previous) multiple of 32. That does something invisible to the code,
-            // but a good optimizer will avoid a `cmp` instruction inside the loop. With a
-            // multiple of 32, the end of the loop can be (if we assume that `n` is in RDX):
-            // sub RDX, 32;
-            // jge START_OF_THE_LOOP.
-            // Without that, it has to be:
-            // sub RDX, 32;
-            // cmp RDX, 32;
-            // jge START_OF_THE_LOOP
-            // NOTE, that we align on a _previous_ multiple (for 37, we will go to 32). That means
-            // we have somehow to compensate for that, which is done at the end of this function.
-            n &= -32;
-            do
+            void store16i_sse(void *dest, int4 reg)
             {
-                store32i_sse(d, xmm0);
-                // NOTE(stefanos): I tried avoiding this operation on `d` by combining
-                // `d` and `n` in the above loop and going backwards. It was slower in my benchs.
-                d += 32;
-                n -= 32;
-            } while (n >= 32);
+                version (LDC)
+                {
+                    storeUnaligned!int4(reg, cast(int*)dest);
+                }
+                else
+                {
+                    storeUnaligned(cast(void16*)dest, reg);
+                }
+            }
+            // TODO(stefanos): Can we broadcast an int in a float4? That would be useful
+            // because then we would use only the float versions.
+            void broadcast_int(ref int4 xmm, int v)
+            {
+                xmm[0] = v;
+                xmm[1] = v;
+                xmm[2] = v;
+                xmm[3] = v;
+            }
+            const uint v = val * 0x01010101;            // Broadcast c to all 4 bytes
+            // NOTE(stefanos): I use the naive version, which in my benchmarks was slower
+            // than the previous classic switch. BUT. Using the switch had a significant
+            // drop in the rest of the sizes. It's not the branch that is responsible for the drop,
+            // but the fact that it's more difficult to optimize it as part of the rest of the code.
+            if (n < 32)    // The final store writes [d+n-32, d+n); with n < 32 it would start before `d`.
+            {
+                Dmemset_naive(cast(ubyte*)d, cast(ubyte)val, n);
+                return;
+            }
+            void *temp = d + n - 0x10;                  // Used for the last 32 bytes
+            int4 xmm0;
+            // Broadcast v to all bytes.
+            broadcast_int(xmm0, v);
+            ubyte rem = cast(ulong)d & 15;              // Remainder from the previous 16-byte boundary.
+            // Store 16 bytes, from which some will possibly overlap on a future store.
+            // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned,
+            // add 16 - 7 = 9 to `d` and start storing aligned. Since 16 - `rem` can be at most
+            // 16, we store 16 bytes anyway.
+            store16i_sse(d, xmm0);
+            d += 16 - rem;
+            n -= 16 - rem;
+            // Move in blocks of 32.
+            // TODO(stefanos): Experiment with different sizes.
+            if (n >= 32)
+            {
+                // Align to (previous) multiple of 32. That does something invisible to the code,
+                // but a good optimizer will avoid a `cmp` instruction inside the loop. With a
+                // multiple of 32, the end of the loop can be (if we assume that `n` is in RDX):
+                // sub RDX, 32;
+                // jge START_OF_THE_LOOP.
+                // Without that, it has to be:
+                // sub RDX, 32;
+                // cmp RDX, 32;
+                // jge START_OF_THE_LOOP
+                // NOTE: we round down to the _previous_ multiple of 32 (37 becomes 32), so the
+                // leftover bytes are not written by the loop; the final store of this function covers them.
+                n &= -32;
+                do
+                {
+                    store32i_sse(d, xmm0);
+                    // NOTE(stefanos): I tried avoiding this operation on `d` by combining
+                    // `d` and `n` in the above loop and going backwards. It was slower in my benchs.
+                    d += 32;
+                    n -= 32;
+                } while (n >= 32);
+            }
+            // Compensate for the last (at most) 32 bytes.
+            store32i_sse(temp-0x10, xmm0);
         }
-        // Compensate for the last (at most) 32 bytes.
-        store32i_sse(temp-0x10, xmm0);
     }
 }
 

From ff7e755c54d1d36901351463e45f542c90e0c873 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Thu, 4 Jul 2019 23:42:50 +0300
Subject: [PATCH 05/29] Remove dependency on std.traits

---
 src/core/experimental/memutils.d | 63 +++++++++++++++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 4a5fda6fba..3b47bcfdf3 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -251,9 +251,70 @@ void Dmemset_naive(void *dst, const ubyte val, size_t n)
 // Range-checking is not needed since the user never
 // pass an `n` (byte count) directly.
 
+// Copied from std.traits
+import core.internal.traits: Unqual;
+
+package template ModifyTypePreservingTQ(alias Modifier, T)
+{
+    static if (is(T U ==          immutable U)) alias ModifyTypePreservingTQ =          immutable Modifier!U;
+    else static if (is(T U == shared inout const U)) alias ModifyTypePreservingTQ = shared inout const Modifier!U;
+    else static if (is(T U == shared inout       U)) alias ModifyTypePreservingTQ = shared inout       Modifier!U;
+    else static if (is(T U == shared       const U)) alias ModifyTypePreservingTQ = shared       const Modifier!U;
+    else static if (is(T U == shared             U)) alias ModifyTypePreservingTQ = shared             Modifier!U;
+    else static if (is(T U ==        inout const U)) alias ModifyTypePreservingTQ =        inout const Modifier!U;
+    else static if (is(T U ==        inout       U)) alias ModifyTypePreservingTQ =              inout Modifier!U;
+    else static if (is(T U ==              const U)) alias ModifyTypePreservingTQ =              const Modifier!U;
+    else                                             alias ModifyTypePreservingTQ =                    Modifier!T;
+}
+
+template OriginalType(T)
+{
+    template Impl(T)
+    {
+        static if (is(T U == enum)) alias Impl = OriginalType!U;
+        else                        alias Impl =              T;
+    }
+
+    alias OriginalType = ModifyTypePreservingTQ!(Impl, T);
+}
+
+enum bool isAggregateType(T) = is(T == struct) || is(T == union) ||
+                               is(T == class) || is(T == interface);
+
+private template AliasThisTypeOf(T)
+if (isAggregateType!T)
+{
+    alias members = __traits(getAliasThis, T);
+
+    static if (members.length == 1)
+    {
+        alias AliasThisTypeOf = typeof(__traits(getMember, T.init, members[0]));
+    }
+    else
+        static assert(0, T.stringof~" does not have alias this type");
+}
+
+template DynamicArrayTypeOf(T)
+{
+    static if (is(AliasThisTypeOf!T AT) && !is(AT[] == AT))
+        alias X = DynamicArrayTypeOf!AT;
+    else
+        alias X = OriginalType!T;
+
+    static if (is(Unqual!X : E[], E) && !is(typeof({ enum n = X.length; })))
+    {
+        alias DynamicArrayTypeOf = X;
+    }
+    else
+        static assert(0, T.stringof~" is not a dynamic array");
+}
+
+enum bool isDynamicArray(T) = is(DynamicArrayTypeOf!T) && !isAggregateType!T;
+enum bool isStaticArray(T) = __traits(isStaticArray, T);
+enum bool isArray(T) = isStaticArray!T || isDynamicArray!T;
+
 void Dmemset(T)(ref T dst, const ubyte val)
 {
-    import std.traits;
     const uint v = cast(uint)val;
     version (D_SIMD)
     {

From 497e53f41b1ac5dd6b491014b54fdad1421b1a98 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Fri, 5 Jul 2019 11:54:55 +0300
Subject: [PATCH 06/29] Minor fixes/changes

---
 src/core/experimental/memutils.d | 24 ++----------------------
 1 file changed, 2 insertions(+), 22 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 3b47bcfdf3..a9df1ac803 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -176,15 +176,6 @@ else
                     storeUnaligned(cast(void16*)dest, reg);
                 }
             }
-            // TODO(stefanos): Can we broadcast an int in a float4? That would be useful
-            // because then we would use only the float versions.
-            void broadcast_int(ref int4 xmm, int v)
-            {
-                xmm[0] = v;
-                xmm[1] = v;
-                xmm[2] = v;
-                xmm[3] = v;
-            }
             const uint v = val * 0x01010101;            // Broadcast c to all 4 bytes
             // NOTE(stefanos): I use the naive version, which in my benchmarks was slower
             // than the previous classic switch. BUT. Using the switch had a significant
@@ -196,10 +187,9 @@ else
                 return;
             }
             void *temp = d + n - 0x10;                  // Used for the last 32 bytes
-            int4 xmm0;
             // Broadcast v to all bytes.
-            broadcast_int(xmm0, v);
-            ubyte rem = cast(ulong)d & 15;              // Remainder from the previous 16-byte boundary.
+            auto xmm0 = int4(v);
+            ubyte rem = cast(ubyte)d & 15;              // Remainder from the previous 16-byte boundary.
             // Store 16 bytes, from which some will possibly overlap on a future store.
             // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned,
             // add 16 - 7 = 9 to `d` and start storing aligned. Since 16 - `rem` can be at most
@@ -322,20 +312,10 @@ void Dmemset(T)(ref T dst, const ubyte val)
         {
             size_t n = dst.length * typeof(dst[0]).sizeof;
             Dmemset(dst.ptr, v, n);
-
-            version (unittest)
-            {
-                Dmemset_naive(dst.ptr, v, n);
-            }
         }
         else
         {
             Dmemset(&dst, v, T.sizeof);
-
-            version (unittest)
-            {
-                Dmemset_naive(&dst, v, T.sizeof);
-            }
         }
     }
     else

From c52c099795d885bbc162570a73b4a7a4a9acbd39 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Fri, 5 Jul 2019 15:13:38 +0300
Subject: [PATCH 07/29] Style and layout changes

---
 src/core/experimental/memutils.d | 333 +++++++++++++++++--------------
 1 file changed, 183 insertions(+), 150 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index a9df1ac803..00a56d523a 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -4,128 +4,53 @@
  */
 module core.experimental.memutils;
 
-unittest
-{
-    Dmemset_testStaticType!(byte)(5);
-    Dmemset_testStaticType!(ubyte)(5);
-    Dmemset_testStaticType!(short)(5);
-    Dmemset_testStaticType!(ushort)(5);
-    Dmemset_testStaticType!(int)(5);
-    Dmemset_testStaticType!(uint)(5);
-    Dmemset_testStaticType!(long)(5);
-    Dmemset_testStaticType!(ulong)(5);
-    Dmemset_testStaticType!(float)(5);
-    Dmemset_testStaticType!(double)(5);
-    Dmemset_testStaticType!(real)(5);
-    Dmemset_testDynamicArray!(ubyte)(5, 3);
-    static foreach (i; 1..10) {
-        Dmemset_testDynamicArray!(ubyte)(5, 2^^i);
-        Dmemset_testStaticArray!(ubyte, 2^^i)(5);
-    }
-    Dmemset_testDynamicArray!(ubyte)(5, 100);
-    Dmemset_testStaticArray!(ubyte, 100)(5);
-    Dmemset_testDynamicArray!(ubyte)(5, 500);
-    Dmemset_testStaticArray!(ubyte, 500)(5);
-    Dmemset_testDynamicArray!(ubyte)(5, 700);
-    Dmemset_testStaticArray!(ubyte, 700)(5);
-    Dmemset_testDynamicArray!(ubyte)(5, 3434);
-    Dmemset_testStaticArray!(ubyte, 3434)(5);
-    Dmemset_testDynamicArray!(ubyte)(5, 7128);
-    Dmemset_testStaticArray!(ubyte, 7128)(5);
-    Dmemset_testDynamicArray!(ubyte)(5, 13908);
-    Dmemset_testStaticArray!(ubyte, 13908)(5);
-    Dmemset_testDynamicArray!(ubyte)(5, 16343);
-    Dmemset_testStaticArray!(ubyte, 16343)(5);
-    Dmemset_testDynamicArray!(ubyte)(5, 27897);
-    Dmemset_testStaticArray!(ubyte, 27897)(5);
-    Dmemset_testDynamicArray!(ubyte)(5, 32344);
-    Dmemset_testStaticArray!(ubyte, 32344)(5);
-    Dmemset_testDynamicArray!(ubyte)(5, 46830);
-    Dmemset_testStaticArray!(ubyte, 46830)(5);
-    Dmemset_testDynamicArray!(ubyte)(5, 64349);
-    Dmemset_testStaticArray!(ubyte, 64349)(5);
-}
+/** Dmemset() implementation */
 
-// From a very good Chandler Carruth video on benchmarking: https://www.youtube.com/watch?v=nXaxk27zwlk
-void escape(void* p)
-{
-    version (LDC)
-    {
-        import ldc.llvmasm;
-        __asm("", "r,~{memory}", p);
-    }
-    version (GNU)
-    {
-        asm { "" : : "g" p : "memory"; }
-    }
-}
-
-void Dmemset_verifyArray(T)(int j, const ref T[] a, const ubyte v)
-{
-    const ubyte *p = cast(const ubyte *) a.ptr;
-    for (size_t i = 0; i < a.length * T.sizeof; i++)
-    {
-        assert(p[i] == v);
-    }
-}
-
-void Dmemset_verifyStaticType(T)(const ref T t, const ubyte v)
-{
-    const ubyte *p = cast(const ubyte *) &t;
-    for (size_t i = 0; i < T.sizeof; i++)
-    {
-        assert(p[i] == v);
-    }
-}
+/**
+ * NOTE(stefanos):
+ * Range-checking is not needed since the user never
+ * passes an `n` (byte count) directly.
+ */
 
-void Dmemset_testDynamicArray(T)(const ubyte v, size_t n)
+/*
+  If T is an array,set all `dst`'s bytes
+  (whose count is the length of the array times
+  the size of the array element) to `val`.
+  Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`.
+ */
+void Dmemset(T)(ref T dst, const ubyte val)
 {
-    T[] buf;
-    buf.length = n + 32;
-
-    enum alignments = 32;
-    size_t len = n;
-
-    foreach (i; 0..alignments)
+    const uint v = cast(uint) val;
+    version (D_SIMD)
     {
-        auto d = buf[i..i+n];
-
-        escape(d.ptr);
-        Dmemset(d, v);
-        Dmemset_verifyArray(i, d, v);
+        static if (isArray!T)
+        {
+            size_t n = dst.length * typeof(dst[0]).sizeof;
+            Dmemset(dst.ptr, v, n);
+        }
+        else
+        {
+            Dmemset(&dst, v, T.sizeof);
+        }
     }
-}
-
-void Dmemset_testStaticArray(T, size_t n)(const ubyte v)
-{
-    T[n + 32] buf;
-
-    enum alignments = 32;
-    size_t len = n;
-
-    foreach (i; 0..alignments)
+    else
     {
-        auto d = buf[i..i+n];
-
-        escape(d.ptr);
-        Dmemset(d, v);
-        Dmemset_verifyArray(i, d, v);
+        static if (isArray!T)
+        {
+            Dmemset_naive(dst.ptr, val, dst.length * typeof(dst[0]).sizeof);
+        }
+        else
+        {
+            Dmemset_naive(&dst, val, T.sizeof);
+        }
     }
 }
 
-void Dmemset_testStaticType(T)(const ubyte v)
-{
-    T t;
-    escape(&t);
-    Dmemset(t, v);
-    Dmemset_verifyStaticType(t, v);
-}
-
 version (GNU)
 {
     void Dmemset(void *d, const uint val, size_t n)
     {
-        Dmemset_naive(d, cast(const(ubyte))val, n);
+        Dmemset_naive(d, cast(const(ubyte)) val, n);
     }
 }
 else
@@ -135,15 +60,14 @@ else
         // NOTE(stefanos): I could not GDC respective intrinsics.
         void Dmemset(void *d, const uint val, size_t n)
         {
-            import core.simd: int4;
+            import core.simd : int4;
             version (LDC)
             {
-                import ldc.simd: loadUnaligned, storeUnaligned;
+                import ldc.simd : loadUnaligned, storeUnaligned;
             }
-            else
-            version (DigitalMars)
+            else version (DigitalMars)
             {
-                import core.simd: void16, loadUnaligned, storeUnaligned;
+                import core.simd : void16, loadUnaligned, storeUnaligned;
             }
             else
             {
@@ -156,24 +80,24 @@ else
             {
                 version (LDC)
                 {
-                    storeUnaligned!int4(reg, cast(int*)dest);
-                    storeUnaligned!int4(reg, cast(int*)(dest+0x10));
+                    storeUnaligned!int4(reg, cast(int*) dest);
+                    storeUnaligned!int4(reg, cast(int*) (dest+0x10));
                 }
                 else
                 {
-                    storeUnaligned(cast(void16*)dest, reg);
-                    storeUnaligned(cast(void16*)(dest+0x10), reg);
+                    storeUnaligned(cast(void16*) dest, reg);
+                    storeUnaligned(cast(void16*) (dest+0x10), reg);
                 }
             }
             void store16i_sse(void *dest, int4 reg)
             {
                 version (LDC)
                 {
-                    storeUnaligned!int4(reg, cast(int*)dest);
+                    storeUnaligned!int4(reg, cast(int*) dest);
                 }
                 else
                 {
-                    storeUnaligned(cast(void16*)dest, reg);
+                    storeUnaligned(cast(void16*) dest, reg);
                 }
             }
             const uint v = val * 0x01010101;            // Broadcast c to all 4 bytes
@@ -183,13 +107,13 @@ else
             // but the fact that it's more difficult to optimize it as part of the rest of the code.
             if (n <= 16)
             {
-                Dmemset_naive(cast(ubyte*)d, cast(ubyte)val, n);
+                Dmemset_naive(cast(ubyte*) d, cast(ubyte) val, n);
                 return;
             }
             void *temp = d + n - 0x10;                  // Used for the last 32 bytes
             // Broadcast v to all bytes.
             auto xmm0 = int4(v);
-            ubyte rem = cast(ubyte)d & 15;              // Remainder from the previous 16-byte boundary.
+            ubyte rem = cast(ubyte) d & 15;              // Remainder from the previous 16-byte boundary.
             // Store 16 bytes, from which some will possibly overlap on a future store.
             // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned,
             // add 16 - 7 = 9 to `d` and start storing aligned. Since 16 - `rem` can be at most
@@ -230,19 +154,36 @@ else
 
 void Dmemset_naive(void *dst, const ubyte val, size_t n)
 {
-    ubyte *d = cast(ubyte*)dst;
-    for (size_t i = 0; i != n; ++i)
+    ubyte *d = cast(ubyte*) dst;
+    foreach (i; 0 .. n)
     {
         d[i] = val;
     }
 }
 
-// NOTE(stefanos):
-// Range-checking is not needed since the user never
-// pass an `n` (byte count) directly.
+/** Core features tests.
+  */
+unittest
+{
+    ubyte a[3];
+    Dmemset(a, 7);
+    assert(a[0] == 7);
+    assert(a[1] == 7);
+    assert(a[2] == 7);
+
+    real b;
+    Dmemset(b, 9);
+    ubyte *p = cast(ubyte*) &b;
+    foreach (i; 0 .. b.sizeof)
+    {
+        assert(p[i] == 9);
+    }
+}
 
-// Copied from std.traits
-import core.internal.traits: Unqual;
+
+/** Handy std.traits code, directly copied from there.
+  */
+import core.internal.traits : Unqual;
 
 package template ModifyTypePreservingTQ(alias Modifier, T)
 {
@@ -303,30 +244,122 @@ enum bool isDynamicArray(T) = is(DynamicArrayTypeOf!T) && !isAggregateType!T;
 enum bool isStaticArray(T) = __traits(isStaticArray, T);
 enum bool isArray(T) = isStaticArray!T || isDynamicArray!T;
 
-void Dmemset(T)(ref T dst, const ubyte val)
+
+/** Test suite code
+  */
+unittest
 {
-    const uint v = cast(uint)val;
-    version (D_SIMD)
+    DmemsetTestStaticType!(byte)(5);
+    DmemsetTestStaticType!(ubyte)(5);
+    DmemsetTestStaticType!(short)(5);
+    DmemsetTestStaticType!(ushort)(5);
+    DmemsetTestStaticType!(int)(5);
+    DmemsetTestStaticType!(uint)(5);
+    DmemsetTestStaticType!(long)(5);
+    DmemsetTestStaticType!(ulong)(5);
+    DmemsetTestStaticType!(float)(5);
+    DmemsetTestStaticType!(double)(5);
+    DmemsetTestStaticType!(real)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 3);
+    static foreach (i; 1..10) {
+        DmemsetTestDynamicArray!(ubyte)(5, 2^^i);
+        DmemsetTestStaticArray!(ubyte, 2^^i)(5);
+    }
+    DmemsetTestDynamicArray!(ubyte)(5, 100);
+    DmemsetTestStaticArray!(ubyte, 100)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 500);
+    DmemsetTestStaticArray!(ubyte, 500)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 700);
+    DmemsetTestStaticArray!(ubyte, 700)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 3434);
+    DmemsetTestStaticArray!(ubyte, 3434)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 7128);
+    DmemsetTestStaticArray!(ubyte, 7128)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 13908);
+    DmemsetTestStaticArray!(ubyte, 13908)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 16343);
+    DmemsetTestStaticArray!(ubyte, 16343)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 27897);
+    DmemsetTestStaticArray!(ubyte, 27897)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 32344);
+    DmemsetTestStaticArray!(ubyte, 32344)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 46830);
+    DmemsetTestStaticArray!(ubyte, 46830)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 64349);
+    DmemsetTestStaticArray!(ubyte, 64349)(5);
+}
+
+// From a very good Chandler Carruth video on benchmarking: https://www.youtube.com/watch?v=nXaxk27zwlk
+void escape(void* p)
+{
+    version (LDC)
     {
-        static if (isArray!T)
-        {
-            size_t n = dst.length * typeof(dst[0]).sizeof;
-            Dmemset(dst.ptr, v, n);
-        }
-        else
-        {
-            Dmemset(&dst, v, T.sizeof);
-        }
+        import ldc.llvmasm;
+        __asm("", "r,~{memory}", p);
     }
-    else
+    version (GNU)
     {
-        static if (isArray!T)
-        {
-            Dmemset_naive(dst.ptr, val, dst.length * typeof(dst[0]).sizeof);
-        }
-        else
-        {
-            Dmemset_naive(&dst, val, T.sizeof);
-        }
+        asm { "" : : "g" p : "memory"; }
+    }
+}
+
+void DmemsetVerifyArray(T)(int j, const ref T[] a, const ubyte v)
+{
+    const ubyte *p = cast(const ubyte *) a.ptr;
+    foreach (i; 0 .. (a.length * T.sizeof))
+    {
+        assert(p[i] == v);
+    }
+}
+
+void DmemsetVerifyStaticType(T)(const ref T t, const ubyte v)
+{
+    const ubyte *p = cast(const ubyte *) &t;
+    foreach (i; 0 .. T.sizeof)
+    {
+        assert(p[i] == v);
+    }
+}
+
+void DmemsetTestDynamicArray(T)(const ubyte v, size_t n)
+{
+    T[] buf;
+    buf.length = n + 32;
+
+    enum alignments = 32;
+    size_t len = n;
+
+    foreach (i; 0 .. alignments)
+    {
+        auto d = buf[i..i+n];
+
+        escape(d.ptr);
+        Dmemset(d, v);
+        DmemsetVerifyArray(i, d, v);
+    }
+}
+
+void DmemsetTestStaticArray(T, size_t n)(const ubyte v)
+{
+    T[n + 32] buf;
+
+    enum alignments = 32;
+    size_t len = n;
+
+    foreach (i; 0..alignments)
+    {
+        auto d = buf[i..i+n];
+
+        escape(d.ptr);
+        Dmemset(d, v);
+        DmemsetVerifyArray(i, d, v);
     }
 }
+
+void DmemsetTestStaticType(T)(const ubyte v)
+{
+    T t;
+    escape(&t);
+    Dmemset(t, v);
+    DmemsetVerifyStaticType(t, v);
+}

From 6ebec4bfb4e13df5d6c2a877aa29a5cc8597f556 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Fri, 5 Jul 2019 15:29:15 +0300
Subject: [PATCH 08/29] Moved tests to test folder

---
 src/core/experimental/memutils.d | 122 +------------------------------
 test/experimental/Makefile       |  17 +++++
 test/experimental/src/memutils.d | 118 ++++++++++++++++++++++++++++++
 3 files changed, 136 insertions(+), 121 deletions(-)
 create mode 100644 test/experimental/Makefile
 create mode 100644 test/experimental/src/memutils.d

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 00a56d523a..4a325d1d5c 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -165,7 +165,7 @@ void Dmemset_naive(void *dst, const ubyte val, size_t n)
   */
 unittest
 {
-    ubyte a[3];
+    ubyte[3] a;
     Dmemset(a, 7);
     assert(a[0] == 7);
     assert(a[1] == 7);
@@ -243,123 +243,3 @@ template DynamicArrayTypeOf(T)
 enum bool isDynamicArray(T) = is(DynamicArrayTypeOf!T) && !isAggregateType!T;
 enum bool isStaticArray(T) = __traits(isStaticArray, T);
 enum bool isArray(T) = isStaticArray!T || isDynamicArray!T;
-
-
-/** Test suite code
-  */
-unittest
-{
-    DmemsetTestStaticType!(byte)(5);
-    DmemsetTestStaticType!(ubyte)(5);
-    DmemsetTestStaticType!(short)(5);
-    DmemsetTestStaticType!(ushort)(5);
-    DmemsetTestStaticType!(int)(5);
-    DmemsetTestStaticType!(uint)(5);
-    DmemsetTestStaticType!(long)(5);
-    DmemsetTestStaticType!(ulong)(5);
-    DmemsetTestStaticType!(float)(5);
-    DmemsetTestStaticType!(double)(5);
-    DmemsetTestStaticType!(real)(5);
-    DmemsetTestDynamicArray!(ubyte)(5, 3);
-    static foreach (i; 1..10) {
-        DmemsetTestDynamicArray!(ubyte)(5, 2^^i);
-        DmemsetTestStaticArray!(ubyte, 2^^i)(5);
-    }
-    DmemsetTestDynamicArray!(ubyte)(5, 100);
-    DmemsetTestStaticArray!(ubyte, 100)(5);
-    DmemsetTestDynamicArray!(ubyte)(5, 500);
-    DmemsetTestStaticArray!(ubyte, 500)(5);
-    DmemsetTestDynamicArray!(ubyte)(5, 700);
-    DmemsetTestStaticArray!(ubyte, 700)(5);
-    DmemsetTestDynamicArray!(ubyte)(5, 3434);
-    DmemsetTestStaticArray!(ubyte, 3434)(5);
-    DmemsetTestDynamicArray!(ubyte)(5, 7128);
-    DmemsetTestStaticArray!(ubyte, 7128)(5);
-    DmemsetTestDynamicArray!(ubyte)(5, 13908);
-    DmemsetTestStaticArray!(ubyte, 13908)(5);
-    DmemsetTestDynamicArray!(ubyte)(5, 16343);
-    DmemsetTestStaticArray!(ubyte, 16343)(5);
-    DmemsetTestDynamicArray!(ubyte)(5, 27897);
-    DmemsetTestStaticArray!(ubyte, 27897)(5);
-    DmemsetTestDynamicArray!(ubyte)(5, 32344);
-    DmemsetTestStaticArray!(ubyte, 32344)(5);
-    DmemsetTestDynamicArray!(ubyte)(5, 46830);
-    DmemsetTestStaticArray!(ubyte, 46830)(5);
-    DmemsetTestDynamicArray!(ubyte)(5, 64349);
-    DmemsetTestStaticArray!(ubyte, 64349)(5);
-}
-
-// From a very good Chandler Carruth video on benchmarking: https://www.youtube.com/watch?v=nXaxk27zwlk
-void escape(void* p)
-{
-    version (LDC)
-    {
-        import ldc.llvmasm;
-        __asm("", "r,~{memory}", p);
-    }
-    version (GNU)
-    {
-        asm { "" : : "g" p : "memory"; }
-    }
-}
-
-void DmemsetVerifyArray(T)(int j, const ref T[] a, const ubyte v)
-{
-    const ubyte *p = cast(const ubyte *) a.ptr;
-    foreach (i; 0 .. (a.length * T.sizeof))
-    {
-        assert(p[i] == v);
-    }
-}
-
-void DmemsetVerifyStaticType(T)(const ref T t, const ubyte v)
-{
-    const ubyte *p = cast(const ubyte *) &t;
-    foreach (i; 0 .. T.sizeof)
-    {
-        assert(p[i] == v);
-    }
-}
-
-void DmemsetTestDynamicArray(T)(const ubyte v, size_t n)
-{
-    T[] buf;
-    buf.length = n + 32;
-
-    enum alignments = 32;
-    size_t len = n;
-
-    foreach (i; 0 .. alignments)
-    {
-        auto d = buf[i..i+n];
-
-        escape(d.ptr);
-        Dmemset(d, v);
-        DmemsetVerifyArray(i, d, v);
-    }
-}
-
-void DmemsetTestStaticArray(T, size_t n)(const ubyte v)
-{
-    T[n + 32] buf;
-
-    enum alignments = 32;
-    size_t len = n;
-
-    foreach (i; 0..alignments)
-    {
-        auto d = buf[i..i+n];
-
-        escape(d.ptr);
-        Dmemset(d, v);
-        DmemsetVerifyArray(i, d, v);
-    }
-}
-
-void DmemsetTestStaticType(T)(const ubyte v)
-{
-    T t;
-    escape(&t);
-    Dmemset(t, v);
-    DmemsetVerifyStaticType(t, v);
-}
diff --git a/test/experimental/Makefile b/test/experimental/Makefile
new file mode 100644
index 0000000000..2dbbd68aae
--- /dev/null
+++ b/test/experimental/Makefile
@@ -0,0 +1,17 @@
+include ../common.mak
+
+TESTS:=memutils
+
+.PHONY: all clean
+all: $(addprefix $(ROOT)/,$(addsuffix .done,$(TESTS)))
+
+$(ROOT)/%.done: $(ROOT)/%
+	@echo Testing $*
+	$(QUIET)$(TIMELIMIT)$(ROOT)/$* $(RUN_ARGS)
+	@touch $@
+
+$(ROOT)/%: $(SRC)/%.d
+	$(QUIET)$(DMD) $(DFLAGS) -of$@ $<
+
+clean:
+	rm -rf $(ROOT)
diff --git a/test/experimental/src/memutils.d b/test/experimental/src/memutils.d
new file mode 100644
index 0000000000..43682baa69
--- /dev/null
+++ b/test/experimental/src/memutils.d
@@ -0,0 +1,118 @@
+import core.experimental.memutils: Dmemset;
+
+void main()
+{
+    DmemsetTestStaticType!(byte)(5);
+    DmemsetTestStaticType!(ubyte)(5);
+    DmemsetTestStaticType!(short)(5);
+    DmemsetTestStaticType!(ushort)(5);
+    DmemsetTestStaticType!(int)(5);
+    DmemsetTestStaticType!(uint)(5);
+    DmemsetTestStaticType!(long)(5);
+    DmemsetTestStaticType!(ulong)(5);
+    DmemsetTestStaticType!(float)(5);
+    DmemsetTestStaticType!(double)(5);
+    DmemsetTestStaticType!(real)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 3);
+    static foreach (i; 1..10) {
+        DmemsetTestDynamicArray!(ubyte)(5, 2^^i);
+        DmemsetTestStaticArray!(ubyte, 2^^i)(5);
+    }
+    DmemsetTestDynamicArray!(ubyte)(5, 100);
+    DmemsetTestStaticArray!(ubyte, 100)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 500);
+    DmemsetTestStaticArray!(ubyte, 500)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 700);
+    DmemsetTestStaticArray!(ubyte, 700)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 3434);
+    DmemsetTestStaticArray!(ubyte, 3434)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 7128);
+    DmemsetTestStaticArray!(ubyte, 7128)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 13908);
+    DmemsetTestStaticArray!(ubyte, 13908)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 16343);
+    DmemsetTestStaticArray!(ubyte, 16343)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 27897);
+    DmemsetTestStaticArray!(ubyte, 27897)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 32344);
+    DmemsetTestStaticArray!(ubyte, 32344)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 46830);
+    DmemsetTestStaticArray!(ubyte, 46830)(5);
+    DmemsetTestDynamicArray!(ubyte)(5, 64349);
+    DmemsetTestStaticArray!(ubyte, 64349)(5);
+}
+
+// From a very good Chandler Carruth video on benchmarking: https://www.youtube.com/watch?v=nXaxk27zwlk
+void escape(void* p)
+{
+    version (LDC)
+    {
+        import ldc.llvmasm;
+        __asm("", "r,~{memory}", p);
+    }
+    version (GNU)
+    {
+        asm { "" : : "g" p : "memory"; }
+    }
+}
+
+void DmemsetVerifyArray(T)(int j, const ref T[] a, const ubyte v)
+{
+    const ubyte *p = cast(const ubyte *) a.ptr;
+    foreach (i; 0 .. (a.length * T.sizeof))
+    {
+        assert(p[i] == v);
+    }
+}
+
+void DmemsetVerifyStaticType(T)(const ref T t, const ubyte v)
+{
+    const ubyte *p = cast(const ubyte *) &t;
+    foreach (i; 0 .. T.sizeof)
+    {
+        assert(p[i] == v);
+    }
+}
+
+void DmemsetTestDynamicArray(T)(const ubyte v, size_t n)
+{
+    T[] buf;
+    buf.length = n + 32;
+
+    enum alignments = 32;
+    size_t len = n;
+
+    foreach (i; 0 .. alignments)
+    {
+        auto d = buf[i..i+n];
+
+        escape(d.ptr);
+        Dmemset(d, v);
+        DmemsetVerifyArray(i, d, v);
+    }
+}
+
+void DmemsetTestStaticArray(T, size_t n)(const ubyte v)
+{
+    T[n + 32] buf;
+
+    enum alignments = 32;
+    size_t len = n;
+
+    foreach (i; 0..alignments)
+    {
+        auto d = buf[i..i+n];
+
+        escape(d.ptr);
+        Dmemset(d, v);
+        DmemsetVerifyArray(i, d, v);
+    }
+}
+
+void DmemsetTestStaticType(T)(const ubyte v)
+{
+    T t;
+    escape(&t);
+    Dmemset(t, v);
+    DmemsetVerifyStaticType(t, v);
+}

From 57552eda1392b5b995389c1f559d6482d1241f77 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Fri, 5 Jul 2019 15:53:32 +0300
Subject: [PATCH 09/29] More naming and style changes

---
 src/core/experimental/memutils.d | 16 ++++++++--------
 test/experimental/src/memutils.d |  8 ++++----
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 4a325d1d5c..5a2420ee5e 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -18,7 +18,7 @@ module core.experimental.memutils;
   the size of the array element) to `val`.
   Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`.
  */
-void Dmemset(T)(ref T dst, const ubyte val)
+void memset(T)(ref T dst, const ubyte val)
 {
     const uint v = cast(uint) val;
     version (D_SIMD)
@@ -37,20 +37,20 @@ void Dmemset(T)(ref T dst, const ubyte val)
     {
         static if (isArray!T)
         {
-            Dmemset_naive(dst.ptr, val, dst.length * typeof(dst[0]).sizeof);
+            DmemsetNaive(dst.ptr, val, dst.length * typeof(dst[0]).sizeof);
         }
         else
         {
-            Dmemset_naive(&dst, val, T.sizeof);
+            DmemsetNaive(&dst, val, T.sizeof);
         }
     }
 }
 
 version (GNU)
 {
-    void Dmemset(void *d, const uint val, size_t n)
+    private void Dmemset(void *d, const uint val, size_t n)
     {
-        Dmemset_naive(d, cast(const(ubyte)) val, n);
+        DmemsetNaive(d, cast(const(ubyte)) val, n);
     }
 }
 else
@@ -58,7 +58,7 @@ else
     version (D_SIMD)
     {
         // NOTE(stefanos): I could not GDC respective intrinsics.
-        void Dmemset(void *d, const uint val, size_t n)
+        private void Dmemset(void *d, const uint val, size_t n)
         {
             import core.simd : int4;
             version (LDC)
@@ -107,7 +107,7 @@ else
             // but the fact that it's more difficult to optimize it as part of the rest of the code.
             if (n <= 16)
             {
-                Dmemset_naive(cast(ubyte*) d, cast(ubyte) val, n);
+                DmemsetNaive(cast(ubyte*) d, cast(ubyte) val, n);
                 return;
             }
             void *temp = d + n - 0x10;                  // Used for the last 32 bytes
@@ -152,7 +152,7 @@ else
     }
 }
 
-void Dmemset_naive(void *dst, const ubyte val, size_t n)
+private void DmemsetNaive(void *dst, const ubyte val, size_t n)
 {
     ubyte *d = cast(ubyte*) dst;
     foreach (i; 0 .. n)
diff --git a/test/experimental/src/memutils.d b/test/experimental/src/memutils.d
index 43682baa69..8a30fc3217 100644
--- a/test/experimental/src/memutils.d
+++ b/test/experimental/src/memutils.d
@@ -1,4 +1,4 @@
-import core.experimental.memutils: Dmemset;
+import core.experimental.memutils : memset;
 
 void main()
 {
@@ -87,7 +87,7 @@ void DmemsetTestDynamicArray(T)(const ubyte v, size_t n)
         auto d = buf[i..i+n];
 
         escape(d.ptr);
-        Dmemset(d, v);
+        memset(d, v);
         DmemsetVerifyArray(i, d, v);
     }
 }
@@ -104,7 +104,7 @@ void DmemsetTestStaticArray(T, size_t n)(const ubyte v)
         auto d = buf[i..i+n];
 
         escape(d.ptr);
-        Dmemset(d, v);
+        memset(d, v);
         DmemsetVerifyArray(i, d, v);
     }
 }
@@ -113,6 +113,6 @@ void DmemsetTestStaticType(T)(const ubyte v)
 {
     T t;
     escape(&t);
-    Dmemset(t, v);
+    memset(t, v);
     DmemsetVerifyStaticType(t, v);
 }

From 60b39670eb8910c580b483bf98dbf1a2e53bd014 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Fri, 5 Jul 2019 15:56:56 +0300
Subject: [PATCH 10/29] Minor fix

---
 src/core/experimental/memutils.d | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 5a2420ee5e..c12b6940d6 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -166,13 +166,13 @@ private void DmemsetNaive(void *dst, const ubyte val, size_t n)
 unittest
 {
     ubyte[3] a;
-    Dmemset(a, 7);
+    memset(a, 7);
     assert(a[0] == 7);
     assert(a[1] == 7);
     assert(a[2] == 7);
 
     real b;
-    Dmemset(b, 9);
+    memset(b, 9);
     ubyte *p = cast(ubyte*) &b;
     foreach (i; 0 .. b.sizeof)
     {

From 4faa8f8dad8e2631b5e4a70065e518e4fc523514 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Fri, 5 Jul 2019 17:00:55 +0300
Subject: [PATCH 11/29] Versioning improvement

---
 src/core/experimental/memutils.d | 202 +++++++++++++++----------------
 1 file changed, 100 insertions(+), 102 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index c12b6940d6..cffa0d4fdb 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -4,7 +4,7 @@
  */
 module core.experimental.memutils;
 
-/** Dmemset() implementation */
+/** memset() implementation */
 
 /**
  * NOTE(stefanos):
@@ -17,32 +17,19 @@ module core.experimental.memutils;
   (whose count is the length of the array times
   the size of the array element) to `val`.
   Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`.
- */
+*/
+
 void memset(T)(ref T dst, const ubyte val)
 {
     const uint v = cast(uint) val;
-    version (D_SIMD)
+    static if (isArray!T)
     {
-        static if (isArray!T)
-        {
-            size_t n = dst.length * typeof(dst[0]).sizeof;
-            Dmemset(dst.ptr, v, n);
-        }
-        else
-        {
-            Dmemset(&dst, v, T.sizeof);
-        }
+        size_t n = dst.length * typeof(dst[0]).sizeof;
+        Dmemset(dst.ptr, v, n);
     }
     else
     {
-        static if (isArray!T)
-        {
-            DmemsetNaive(dst.ptr, val, dst.length * typeof(dst[0]).sizeof);
-        }
-        else
-        {
-            DmemsetNaive(&dst, val, T.sizeof);
-        }
+        Dmemset(&dst, v, T.sizeof);
     }
 }
 
@@ -50,109 +37,119 @@ version (GNU)
 {
     private void Dmemset(void *d, const uint val, size_t n)
     {
-        DmemsetNaive(d, cast(const(ubyte)) val, n);
+        DmemsetNaive(d, cast(const(ubyte)) val, n); // GDC path: byte loop; `val` holds one byte, narrow it explicitly
     }
 }
 else
+version (D_SIMD)
 {
-    version (D_SIMD)
+    /* SIMD implementation
+     */
+    private void Dmemset(void *d, const uint val, size_t n)
     {
-        // NOTE(stefanos): I could not GDC respective intrinsics.
-        private void Dmemset(void *d, const uint val, size_t n)
+        import core.simd : int4;
+        version (LDC)
+        {
+            import ldc.simd : loadUnaligned, storeUnaligned;
+        }
+        else version (DigitalMars)
+        {
+            import core.simd : void16, loadUnaligned, storeUnaligned;
+        }
+        else
+        {
+            static assert(0, "Only DMD / LDC are supported");
+        }
+        // TODO(stefanos): Is there a way to make them @safe?
+        // (The problem is that for LDC, they could take int* or float* pointers
+        // but the cast to void16 for DMD is necessary anyway).
+        void store32i_sse(void *dest, int4 reg)
         {
-            import core.simd : int4;
             version (LDC)
             {
-                import ldc.simd : loadUnaligned, storeUnaligned;
-            }
-            else version (DigitalMars)
-            {
-                import core.simd : void16, loadUnaligned, storeUnaligned;
+                storeUnaligned!int4(reg, cast(int*) dest);
+                storeUnaligned!int4(reg, cast(int*) (dest+0x10));
             }
             else
             {
-                static assert(0, "Only DMD / LDC are supported");
-            }
-            // TODO(stefanos): Is there a way to make them @safe?
-            // (The problem is that for LDC, they could take int* or float* pointers
-            // but the cast to void16 for DMD is necessary anyway).
-            void store32i_sse(void *dest, int4 reg)
-            {
-                version (LDC)
-                {
-                    storeUnaligned!int4(reg, cast(int*) dest);
-                    storeUnaligned!int4(reg, cast(int*) (dest+0x10));
-                }
-                else
-                {
-                    storeUnaligned(cast(void16*) dest, reg);
-                    storeUnaligned(cast(void16*) (dest+0x10), reg);
-                }
+                storeUnaligned(cast(void16*) dest, reg);
+                storeUnaligned(cast(void16*) (dest+0x10), reg);
             }
-            void store16i_sse(void *dest, int4 reg)
+        }
+        void store16i_sse(void *dest, int4 reg)
+        {
+            version (LDC)
             {
-                version (LDC)
-                {
-                    storeUnaligned!int4(reg, cast(int*) dest);
-                }
-                else
-                {
-                    storeUnaligned(cast(void16*) dest, reg);
-                }
+                storeUnaligned!int4(reg, cast(int*) dest);
             }
-            const uint v = val * 0x01010101;            // Broadcast c to all 4 bytes
-            // NOTE(stefanos): I use the naive version, which in my benchmarks was slower
-            // than the previous classic switch. BUT. Using the switch had a significant
-            // drop in the rest of the sizes. It's not the branch that is responsible for the drop,
-            // but the fact that it's more difficult to optimize it as part of the rest of the code.
-            if (n <= 16)
+            else
             {
-                DmemsetNaive(cast(ubyte*) d, cast(ubyte) val, n);
-                return;
+                storeUnaligned(cast(void16*) dest, reg);
             }
-            void *temp = d + n - 0x10;                  // Used for the last 32 bytes
-            // Broadcast v to all bytes.
-            auto xmm0 = int4(v);
-            ubyte rem = cast(ubyte) d & 15;              // Remainder from the previous 16-byte boundary.
-            // Store 16 bytes, from which some will possibly overlap on a future store.
-            // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned,
-            // add 16 - 7 = 9 to `d` and start storing aligned. Since 16 - `rem` can be at most
-            // 16, we store 16 bytes anyway.
-            store16i_sse(d, xmm0);
-            d += 16 - rem;
-            n -= 16 - rem;
-            // Move in blocks of 32.
-            // TODO(stefanos): Experiment with differnt sizes.
-            if (n >= 32)
+        }
+        const uint v = val * 0x01010101;            // Broadcast c to all 4 bytes
+        // NOTE(stefanos): I use the naive version, which in my benchmarks was slower
+        // than the previous classic switch. BUT. Using the switch had a significant
+        // drop in the rest of the sizes. It's not the branch that is responsible for the drop,
+        // but the fact that it's more difficult to optimize it as part of the rest of the code.
+        if (n <= 16)
+        {
+            DmemsetNaive(cast(ubyte*) d, cast(ubyte) val, n);
+            return;
+        }
+        void *temp = d + n - 0x10;                  // Used for the last 32 bytes
+        // Broadcast v to all bytes.
+        auto xmm0 = int4(v);
+        ubyte rem = cast(ubyte) d & 15;              // Remainder from the previous 16-byte boundary.
+        // Store 16 bytes, from which some will possibly overlap on a future store.
+        // For example, if the `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned,
+        // add 16 - 7 = 9 to `d` and start storing aligned. Since 16 - `rem` can be at most
+        // 16, we store 16 bytes anyway.
+        store16i_sse(d, xmm0);
+        d += 16 - rem;
+        n -= 16 - rem;
+        // Move in blocks of 32.
+        // TODO(stefanos): Experiment with differnt sizes.
+        if (n >= 32)
+        {
+            // Align to (previous) multiple of 32. That does something invisible to the code,
+            // but a good optimizer will avoid a `cmp` instruction inside the loop. With a
+            // multiple of 32, the end of the loop can be (if we assume that `n` is in RDX):
+            // sub RDX, 32;
+            // jge START_OF_THE_LOOP.
+            // Without that, it has to be:
+            // sub RDX, 32;
+            // cmp RDX, 32;
+            // jge START_OF_THE_LOOP
+            // NOTE, that we align on a _previous_ multiple (for 37, we will go to 32). That means
+            // we have somehow to compensate for that, which is done at the end of this function.
+            n &= -32;
+            do
             {
-                // Align to (previous) multiple of 32. That does something invisible to the code,
-                // but a good optimizer will avoid a `cmp` instruction inside the loop. With a
-                // multiple of 32, the end of the loop can be (if we assume that `n` is in RDX):
-                // sub RDX, 32;
-                // jge START_OF_THE_LOOP.
-                // Without that, it has to be:
-                // sub RDX, 32;
-                // cmp RDX, 32;
-                // jge START_OF_THE_LOOP
-                // NOTE, that we align on a _previous_ multiple (for 37, we will go to 32). That means
-                // we have somehow to compensate for that, which is done at the end of this function.
-                n &= -32;
-                do
-                {
-                    store32i_sse(d, xmm0);
-                    // NOTE(stefanos): I tried avoiding this operation on `d` by combining
-                    // `d` and `n` in the above loop and going backwards. It was slower in my benchs.
-                    d += 32;
-                    n -= 32;
-                } while (n >= 32);
-            }
-            // Compensate for the last (at most) 32 bytes.
-            store32i_sse(temp-0x10, xmm0);
+                store32i_sse(d, xmm0);
+                // NOTE(stefanos): I tried avoiding this operation on `d` by combining
+                // `d` and `n` in the above loop and going backwards. It was slower in my benchs.
+                d += 32;
+                n -= 32;
+            } while (n >= 32);
         }
+        // Compensate for the last (at most) 32 bytes.
+        store32i_sse(temp-0x10, xmm0);
+    }
+
+}
+else
+{
+    private void Dmemset(void *d, const uint val, size_t n)
+    {
+        memsetNaive(d, val, n);
     }
+    
 }
 
-private void DmemsetNaive(void *dst, const ubyte val, size_t n)
+/* Naive implementation
+ */
+private void memsetNaive(void *dst, const ubyte val, size_t n)
 {
     ubyte *d = cast(ubyte*) dst;
     foreach (i; 0 .. n)
@@ -161,6 +158,7 @@ private void DmemsetNaive(void *dst, const ubyte val, size_t n)
     }
 }
 
+
 /** Core features tests.
   */
 unittest

From a161b98ecb83e91389190d73f16e103182b3f3f6 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Fri, 5 Jul 2019 17:15:13 +0300
Subject: [PATCH 12/29] Move std.traits code to core.internal.traits

---
 src/core/experimental/memutils.d |  66 +---------------
 src/core/internal/traits.d       | 125 +++++++++++++++++++++++++++++++
 2 files changed, 126 insertions(+), 65 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index cffa0d4fdb..52cea7f555 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -21,6 +21,7 @@ module core.experimental.memutils;
 
 void memset(T)(ref T dst, const ubyte val)
 {
+    import core.internal.traits : isArray;
     const uint v = cast(uint) val;
     static if (isArray!T)
     {
@@ -144,7 +145,6 @@ else
     {
         memsetNaive(d, val, n);
     }
-    
 }
 
 /* Naive implementation
@@ -177,67 +177,3 @@ unittest
         assert(p[i] == 9);
     }
 }
-
-
-/** Handy std.traits code, directly copied from there.
-  */
-import core.internal.traits : Unqual;
-
-package template ModifyTypePreservingTQ(alias Modifier, T)
-{
-    static if (is(T U ==          immutable U)) alias ModifyTypePreservingTQ =          immutable Modifier!U;
-    else static if (is(T U == shared inout const U)) alias ModifyTypePreservingTQ = shared inout const Modifier!U;
-    else static if (is(T U == shared inout       U)) alias ModifyTypePreservingTQ = shared inout       Modifier!U;
-    else static if (is(T U == shared       const U)) alias ModifyTypePreservingTQ = shared       const Modifier!U;
-    else static if (is(T U == shared             U)) alias ModifyTypePreservingTQ = shared             Modifier!U;
-    else static if (is(T U ==        inout const U)) alias ModifyTypePreservingTQ =        inout const Modifier!U;
-    else static if (is(T U ==        inout       U)) alias ModifyTypePreservingTQ =              inout Modifier!U;
-    else static if (is(T U ==              const U)) alias ModifyTypePreservingTQ =              const Modifier!U;
-    else                                             alias ModifyTypePreservingTQ =                    Modifier!T;
-}
-
-template OriginalType(T)
-{
-    template Impl(T)
-    {
-        static if (is(T U == enum)) alias Impl = OriginalType!U;
-        else                        alias Impl =              T;
-    }
-
-    alias OriginalType = ModifyTypePreservingTQ!(Impl, T);
-}
-
-enum bool isAggregateType(T) = is(T == struct) || is(T == union) ||
-                               is(T == class) || is(T == interface);
-
-private template AliasThisTypeOf(T)
-if (isAggregateType!T)
-{
-    alias members = __traits(getAliasThis, T);
-
-    static if (members.length == 1)
-    {
-        alias AliasThisTypeOf = typeof(__traits(getMember, T.init, members[0]));
-    }
-    else
-        static assert(0, T.stringof~" does not have alias this type");
-}
-
-template DynamicArrayTypeOf(T)
-{
-    static if (is(AliasThisTypeOf!T AT) && !is(AT[] == AT))
-        alias X = DynamicArrayTypeOf!AT;
-    else
-        alias X = OriginalType!T;
-
-    static if (is(Unqual!X : E[], E) && !is(typeof({ enum n = X.length; })))
-    {
-        alias DynamicArrayTypeOf = X;
-    }
-    else
-        static assert(0, T.stringof~" is not a dynamic array");
-}
-
-enum bool isDynamicArray(T) = is(DynamicArrayTypeOf!T) && !isAggregateType!T;
-enum bool isStaticArray(T) = __traits(isStaticArray, T);
-enum bool isArray(T) = isStaticArray!T || isDynamicArray!T;
diff --git a/src/core/internal/traits.d b/src/core/internal/traits.d
index bccf1ad356..089346e251 100644
--- a/src/core/internal/traits.d
+++ b/src/core/internal/traits.d
@@ -567,3 +567,128 @@ if (func.length == 1 /*&& isCallable!func*/)
     static assert(P_dglit.length == 1);
     static assert(is(P_dglit[0] == int));
 }
+
+// [For internal use]
+package template ModifyTypePreservingTQ(alias Modifier, T)
+{
+         static if (is(T U ==          immutable U)) alias ModifyTypePreservingTQ =          immutable Modifier!U;
+    else static if (is(T U == shared inout const U)) alias ModifyTypePreservingTQ = shared inout const Modifier!U;
+    else static if (is(T U == shared inout       U)) alias ModifyTypePreservingTQ = shared inout       Modifier!U;
+    else static if (is(T U == shared       const U)) alias ModifyTypePreservingTQ = shared       const Modifier!U;
+    else static if (is(T U == shared             U)) alias ModifyTypePreservingTQ = shared             Modifier!U;
+    else static if (is(T U ==        inout const U)) alias ModifyTypePreservingTQ =        inout const Modifier!U;
+    else static if (is(T U ==        inout       U)) alias ModifyTypePreservingTQ =              inout Modifier!U;
+    else static if (is(T U ==              const U)) alias ModifyTypePreservingTQ =              const Modifier!U;
+    else                                             alias ModifyTypePreservingTQ =                    Modifier!T;
+}
+
+@safe unittest
+{
+    alias Intify(T) = int;
+    static assert(is(ModifyTypePreservingTQ!(Intify,                    real) ==                    int));
+    static assert(is(ModifyTypePreservingTQ!(Intify,              const real) ==              const int));
+    static assert(is(ModifyTypePreservingTQ!(Intify,        inout       real) ==        inout       int));
+    static assert(is(ModifyTypePreservingTQ!(Intify,        inout const real) ==        inout const int));
+    static assert(is(ModifyTypePreservingTQ!(Intify, shared             real) == shared             int));
+    static assert(is(ModifyTypePreservingTQ!(Intify, shared       const real) == shared       const int));
+    static assert(is(ModifyTypePreservingTQ!(Intify, shared inout       real) == shared inout       int));
+    static assert(is(ModifyTypePreservingTQ!(Intify, shared inout const real) == shared inout const int));
+    static assert(is(ModifyTypePreservingTQ!(Intify,          immutable real) ==          immutable int));
+}
+
+/**
+ * Strips off all `enum`s from type `T`.
+ */
+template OriginalType(T)
+{
+    template Impl(T)
+    {
+        static if (is(T U == enum)) alias Impl = OriginalType!U;
+        else                        alias Impl =              T;
+    }
+
+    alias OriginalType = ModifyTypePreservingTQ!(Impl, T);
+}
+
+///
+@safe unittest
+{
+    enum E : real { a = 0 } // NOTE: explicit initialization to 0 required during Enum init deprecation cycle
+    enum F : E    { a = E.a }
+    alias G = const(F);
+    static assert(is(OriginalType!E == real));
+    static assert(is(OriginalType!F == real));
+    static assert(is(OriginalType!G == const real));
+}
+
+/**
+ * Detect whether type `T` is an aggregate type.
+ */
+enum bool isAggregateType(T) = is(T == struct) || is(T == union) ||
+                               is(T == class) || is(T == interface);
+
+private template AliasThisTypeOf(T)
+if (isAggregateType!T)
+{
+    alias members = __traits(getAliasThis, T);
+
+    static if (members.length == 1)
+    {
+        alias AliasThisTypeOf = typeof(__traits(getMember, T.init, members[0]));
+    }
+    else
+        static assert(0, T.stringof~" does not have alias this type");
+}
+
+/*
+ */
+template DynamicArrayTypeOf(T)
+{
+    static if (is(AliasThisTypeOf!T AT) && !is(AT[] == AT))
+        alias X = DynamicArrayTypeOf!AT;
+    else
+        alias X = OriginalType!T;
+
+    static if (is(Unqual!X : E[], E) && !is(typeof({ enum n = X.length; })))
+    {
+        alias DynamicArrayTypeOf = X;
+    }
+    else
+        static assert(0, T.stringof~" is not a dynamic array");
+}
+
+@safe unittest
+{
+    static foreach (T; AliasSeq!(/*void, */bool, NumericTypeList, /*ImaginaryTypeList, ComplexTypeList*/))
+        static foreach (Q; AliasSeq!(TypeQualifierList, InoutOf, SharedInoutOf))
+        {
+            static assert(is( Q!T[]  == DynamicArrayTypeOf!( Q!T[] ) ));
+            static assert(is( Q!(T[])  == DynamicArrayTypeOf!( Q!(T[]) ) ));
+
+            static foreach (P; AliasSeq!(MutableOf, ConstOf, ImmutableOf))
+            {
+                static assert(is( Q!(P!T[]) == DynamicArrayTypeOf!( Q!(SubTypeOf!(P!T[])) ) ));
+                static assert(is( Q!(P!(T[])) == DynamicArrayTypeOf!( Q!(SubTypeOf!(P!(T[]))) ) ));
+            }
+        }
+
+    static assert(!is(DynamicArrayTypeOf!(int[3])));
+    static assert(!is(DynamicArrayTypeOf!(void[3])));
+    static assert(!is(DynamicArrayTypeOf!(typeof(null))));
+}
+
+/**
+ * Detect whether type `T` is a dynamic array.
+ */
+enum bool isDynamicArray(T) = is(DynamicArrayTypeOf!T) && !isAggregateType!T;
+
+/**
+ * Detect whether type `T` is an array (static or dynamic; for associative
+ *  arrays see $(LREF isAssociativeArray)).
+ */
+enum bool isArray(T) = isStaticArray!T || isDynamicArray!T;
+
+/**
+ * Detect whether type `T` is a static array.
+ */
+enum bool isStaticArray(T) = __traits(isStaticArray, T);

From 5da39a9345bf90f28060d5982581ff066e4d2e69 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Fri, 5 Jul 2019 17:17:20 +0300
Subject: [PATCH 13/29] Naming fix

---
 src/core/experimental/memutils.d | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 52cea7f555..ab83e4f2b9 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -95,7 +95,7 @@ version (D_SIMD)
         // but the fact that it's more difficult to optimize it as part of the rest of the code.
         if (n <= 16)
         {
-            DmemsetNaive(cast(ubyte*) d, cast(ubyte) val, n);
+            memsetNaive(cast(ubyte*) d, cast(ubyte) val, n);
             return;
         }
         void *temp = d + n - 0x10;                  // Used for the last 32 bytes

From cc6d019d3f72bdb467b5671d9642fa8c2bef53a2 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Fri, 5 Jul 2019 17:22:04 +0300
Subject: [PATCH 14/29] Remove use of non-existent symbols from
 core.internal.traits unittests

---
 src/core/internal/traits.d | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/src/core/internal/traits.d b/src/core/internal/traits.d
index 089346e251..aa331590ac 100644
--- a/src/core/internal/traits.d
+++ b/src/core/internal/traits.d
@@ -657,21 +657,10 @@ template DynamicArrayTypeOf(T)
         static assert(0, T.stringof~" is not a dynamic array");
 }
 
+// TODO(stefanos): More unit-testing.
+
 @safe unittest
 {
-    static foreach (T; AliasSeq!(/*void, */bool, NumericTypeList, /*ImaginaryTypeList, ComplexTypeList*/))
-        static foreach (Q; AliasSeq!(TypeQualifierList, InoutOf, SharedInoutOf))
-        {
-            static assert(is( Q!T[]  == DynamicArrayTypeOf!( Q!T[] ) ));
-            static assert(is( Q!(T[])  == DynamicArrayTypeOf!( Q!(T[]) ) ));
-
-            static foreach (P; AliasSeq!(MutableOf, ConstOf, ImmutableOf))
-            {
-                static assert(is( Q!(P!T[]) == DynamicArrayTypeOf!( Q!(SubTypeOf!(P!T[])) ) ));
-                static assert(is( Q!(P!(T[])) == DynamicArrayTypeOf!( Q!(SubTypeOf!(P!(T[]))) ) ));
-            }
-        }
-
     static assert(!is(DynamicArrayTypeOf!(int[3])));
     static assert(!is(DynamicArrayTypeOf!(void[3])));
     static assert(!is(DynamicArrayTypeOf!(typeof(null))));

From 7b9eb3c9dcffef2956446bf47abd5661501e8657 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Fri, 5 Jul 2019 17:29:03 +0300
Subject: [PATCH 15/29] Fix for uint vs ubyte in memsetNaive

---
 src/core/experimental/memutils.d | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index ab83e4f2b9..4fe44cbd1f 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -149,12 +149,12 @@ else
 
 /* Naive implementation
  */
-private void memsetNaive(void *dst, const ubyte val, size_t n)
+private void memsetNaive(void *dst, const uint val, size_t n)
 {
     ubyte *d = cast(ubyte*) dst;
     foreach (i; 0 .. n)
     {
-        d[i] = val;
+        d[i] = cast(ubyte)val;
     }
 }
 

From 08d044ff7055bcfcad18c219abdbc93a11022dd7 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Sat, 6 Jul 2019 14:27:36 +0300
Subject: [PATCH 16/29] Removed escaping from tests in memutils

---
 test/experimental/src/memutils.d | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/test/experimental/src/memutils.d b/test/experimental/src/memutils.d
index 8a30fc3217..f9ed626c67 100644
--- a/test/experimental/src/memutils.d
+++ b/test/experimental/src/memutils.d
@@ -42,20 +42,6 @@ void main()
     DmemsetTestStaticArray!(ubyte, 64349)(5);
 }
 
-// From a very good Chandler Carruth video on benchmarking: https://www.youtube.com/watch?v=nXaxk27zwlk
-void escape(void* p)
-{
-    version (LDC)
-    {
-        import ldc.llvmasm;
-        __asm("", "r,~{memory}", p);
-    }
-    version (GNU)
-    {
-        asm { "" : : "g" p : "memory"; }
-    }
-}
-
 void DmemsetVerifyArray(T)(int j, const ref T[] a, const ubyte v)
 {
     const ubyte *p = cast(const ubyte *) a.ptr;
@@ -74,6 +60,10 @@ void DmemsetVerifyStaticType(T)(const ref T t, const ubyte v)
     }
 }
 
+// NOTE(stefanos): Escaping the pointers is not needed, the compiler doesn't optimize it away.
+// My best guess is that this is because of the verification (i.e. if the operation is not done,
+// an assert will fire and does not satisfy correctness).
+
 void DmemsetTestDynamicArray(T)(const ubyte v, size_t n)
 {
     T[] buf;

From d611a186485b6cebf04b8657aefaf82e3e772b85 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Sun, 7 Jul 2019 18:40:27 +0300
Subject: [PATCH 17/29] Versioning improvement

---
 src/core/experimental/memutils.d | 41 ++++++++++++++------------------
 1 file changed, 18 insertions(+), 23 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 4fe44cbd1f..00a5058d3f 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -34,18 +34,22 @@ void memset(T)(ref T dst, const ubyte val)
     }
 }
 
-version (GNU)
+version (D_SIMD)
 {
-    private void Dmemset(void *d, const uint val, size_t n)
-    {
-        memsetNaive(d, val, n);
-    }
+    version = useSIMD;
 }
-else
-version (D_SIMD)
+version (LDC)
+{
+    // LDC always supports SIMD and the back-end uses the most
+    // appropriate size for every target.
+    version = useSIMD;
+}
+
+version (useSIMD)
 {
     /* SIMD implementation
      */
+    //pragma(msg, "SIMD used");
     private void Dmemset(void *d, const uint val, size_t n)
     {
         import core.simd : int4;
@@ -64,30 +68,23 @@ version (D_SIMD)
         // TODO(stefanos): Is there a way to make them @safe?
         // (The problem is that for LDC, they could take int* or float* pointers
         // but the cast to void16 for DMD is necessary anyway).
-        void store32i_sse(void *dest, int4 reg)
+        void store16i_sse(void *dest, int4 reg)
         {
             version (LDC)
             {
                 storeUnaligned!int4(reg, cast(int*) dest);
-                storeUnaligned!int4(reg, cast(int*) (dest+0x10));
             }
             else
             {
                 storeUnaligned(cast(void16*) dest, reg);
-                storeUnaligned(cast(void16*) (dest+0x10), reg);
             }
         }
-        void store16i_sse(void *dest, int4 reg)
+        void store32i_sse(void *dest, int4 reg)
         {
-            version (LDC)
-            {
-                storeUnaligned!int4(reg, cast(int*) dest);
-            }
-            else
-            {
-                storeUnaligned(cast(void16*) dest, reg);
-            }
+            store16i_sse(dest, reg);
+            store16i_sse(dest+0x10, reg);
         }
+
         const uint v = val * 0x01010101;            // Broadcast c to all 4 bytes
         // NOTE(stefanos): I use the naive version, which in my benchmarks was slower
         // than the previous classic switch. BUT. Using the switch had a significant
@@ -137,18 +134,17 @@ version (D_SIMD)
         // Compensate for the last (at most) 32 bytes.
         store32i_sse(temp-0x10, xmm0);
     }
-
 }
 else
 {
+    /* Forward to simple implementation.
+     */
     private void Dmemset(void *d, const uint val, size_t n)
     {
         memsetNaive(d, val, n);
     }
 }
 
-/* Naive implementation
- */
 private void memsetNaive(void *dst, const uint val, size_t n)
 {
     ubyte *d = cast(ubyte*) dst;
@@ -158,7 +154,6 @@ private void memsetNaive(void *dst, const uint val, size_t n)
     }
 }
 
-
 /** Core features tests.
   */
 unittest

From 00ca80a4ae50aa2d25dd6a86b6cbf0242140c29c Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Sun, 7 Jul 2019 21:29:09 +0300
Subject: [PATCH 18/29] GDC SIMD version and bug fix

---
 src/core/experimental/memutils.d | 86 +++++++++++++++++++++++++-------
 1 file changed, 67 insertions(+), 19 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 00a5058d3f..da1b6aa9e9 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -13,15 +13,16 @@ module core.experimental.memutils;
  */
 
 /*
-  If T is an array,set all `dst`'s bytes
+  If T is an array, set all `dst`'s bytes
   (whose count is the length of the array times
   the size of the array element) to `val`.
   Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`.
 */
-
-void memset(T)(ref T dst, const ubyte val)
+// This is named Dmemset (contrary to the D runtime 
+// PR where it's named memset()) for clear disambiguation with the libc memset().
+void Dmemset(T)(ref T dst, const ubyte val)
 {
-    import core.internal.traits : isArray;
+    import std.traits : isArray;
     const uint v = cast(uint) val;
     static if (isArray!T)
     {
@@ -38,10 +39,16 @@ version (D_SIMD)
 {
     version = useSIMD;
 }
-version (LDC)
+else version (LDC)
+{
+    // LDC always supports SIMD (but doesn't ever set D_SIMD) and
+    // the back-end uses the most appropriate size for every target.
+    version = useSIMD;
+}
+else version (GNU)
 {
-    // LDC always supports SIMD and the back-end uses the most
-    // appropriate size for every target.
+    // GNU does not support SIMD by default. We have to do more complicated
+    // stuff below. So we start by default with useSIMD and decide later.
     version = useSIMD;
 }
 
@@ -50,34 +57,75 @@ version (useSIMD)
     /* SIMD implementation
      */
     //pragma(msg, "SIMD used");
-    private void Dmemset(void *d, const uint val, size_t n)
+    extern(C) private void Dmemset(void *d, const uint val, size_t n)
     {
         import core.simd : int4;
         version (LDC)
         {
+            enum gdcSIMD = false;
             import ldc.simd : loadUnaligned, storeUnaligned;
         }
         else version (DigitalMars)
         {
             import core.simd : void16, loadUnaligned, storeUnaligned;
         }
-        else
+        else version (GNU)
         {
-            static assert(0, "Only DMD / LDC are supported");
+            // NOTE(stefanos): I could not combine GDC versioning in `useSIMD`.
+            // To know if we can use SIMD for GDC is more complex. We need to:
+            // - Be in x86 arch since the intrinsics (builtins) are only x86 specific.
+            // - Compile the int4 vector size.
+            // TODO(stefanos): The GCC specification points that to use the store intrinsic,
+            // we have to be in SSE2. Is this guaranteed if `int4` compiles?
+            // Note that GCC builtins provide the __builtin_cpu_supports() but this is a runtime
+            // function.
+            version (X86_64)
+            {
+                enum isX86 = true;
+            }
+            else version (X86)
+            {
+                enum isX86 = true;
+            }
+
+            static if (isX86 && __traits(compiles, int4))
+            {
+                enum gdcSIMD = true;
+            }
+            else
+            {
+                memsetNaive(d, val, n);
+                return;
+            }
         }
+
         // TODO(stefanos): Is there a way to make them @safe?
         // (The problem is that for LDC, they could take int* or float* pointers
         // but the cast to void16 for DMD is necessary anyway).
-        void store16i_sse(void *dest, int4 reg)
+
+        static if (gdcSIMD)
         {
-            version (LDC)
+            import gcc.builtins;
+            import core.simd : ubyte16;
+            void store16i_sse(void *dest, int4 reg)
             {
-                storeUnaligned!int4(reg, cast(int*) dest);
+                __builtin_ia32_storedqu(cast(char*) dest, cast(ubyte16) reg);
             }
-            else
+        }
+        else
+        {
+            void store16i_sse(void *dest, int4 reg)
             {
-                storeUnaligned(cast(void16*) dest, reg);
+                version (LDC)
+                {
+                    storeUnaligned!int4(reg, cast(int*) dest);
+                }
+                else
+                {
+                    storeUnaligned(cast(void16*) dest, reg);
+                }
             }
+
         }
         void store32i_sse(void *dest, int4 reg)
         {
@@ -85,17 +133,17 @@ version (useSIMD)
             store16i_sse(dest+0x10, reg);
         }
 
-        const uint v = val * 0x01010101;            // Broadcast c to all 4 bytes
         // NOTE(stefanos): I use the naive version, which in my benchmarks was slower
         // than the previous classic switch. BUT. Using the switch had a significant
         // drop in the rest of the sizes. It's not the branch that is responsible for the drop,
         // but the fact that it's more difficult to optimize it as part of the rest of the code.
-        if (n <= 16)
+        if (n < 32)
         {
             memsetNaive(cast(ubyte*) d, cast(ubyte) val, n);
             return;
         }
         void *temp = d + n - 0x10;                  // Used for the last 32 bytes
+        const uint v = val * 0x01010101;            // Broadcast c to all 4 bytes
         // Broadcast v to all bytes.
         auto xmm0 = int4(v);
         ubyte rem = cast(ubyte) d & 15;              // Remainder from the previous 16-byte boundary.
@@ -159,13 +207,13 @@ private void memsetNaive(void *dst, const uint val, size_t n)
 unittest
 {
     ubyte[3] a;
-    memset(a, 7);
+    Dmemset(a, 7);
     assert(a[0] == 7);
     assert(a[1] == 7);
     assert(a[2] == 7);
 
     real b;
-    memset(b, 9);
+    Dmemset(b, 9);
     ubyte *p = cast(ubyte*) &b;
     foreach (i; 0 .. b.sizeof)
     {

From 9ad8f16ecb4a565a0bfb329de63aba7d50067d0e Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Sun, 7 Jul 2019 22:15:13 +0300
Subject: [PATCH 19/29] Not so naive version of memsetNaive

---
 src/core/experimental/memutils.d | 75 +++++++++++++++++++++++++++-----
 1 file changed, 65 insertions(+), 10 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index da1b6aa9e9..d117dd25a4 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -18,11 +18,9 @@ module core.experimental.memutils;
   the size of the array element) to `val`.
   Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`.
 */
-// This is named Dmemset (contrary to the D runtime 
-// PR where it's named memset()) for clear disambiguation with the libc memset().
-void Dmemset(T)(ref T dst, const ubyte val)
+void memset(T)(ref T dst, const ubyte val)
 {
-    import std.traits : isArray;
+    import core.internal.traits : isArray;
     const uint v = cast(uint) val;
     static if (isArray!T)
     {
@@ -35,6 +33,7 @@ void Dmemset(T)(ref T dst, const ubyte val)
     }
 }
 
+
 version (D_SIMD)
 {
     version = useSIMD;
@@ -67,6 +66,7 @@ version (useSIMD)
         }
         else version (DigitalMars)
         {
+            enum gdcSIMD = false;
             import core.simd : void16, loadUnaligned, storeUnaligned;
         }
         else version (GNU)
@@ -139,7 +139,7 @@ version (useSIMD)
         // but the fact that it's more difficult to optimize it as part of the rest of the code.
         if (n < 32)
         {
-            memsetNaive(cast(ubyte*) d, cast(ubyte) val, n);
+            memsetNaive(d, val, n);
             return;
         }
         void *temp = d + n - 0x10;                  // Used for the last 32 bytes
@@ -193,13 +193,68 @@ else
     }
 }
 
+// NOTE(stefanos): We're using naive for the < 32 case in the SIMD version.
+// To be more performant, for that case, we would have a big fall-through switch
+// for all < 32 sizes.
 private void memsetNaive(void *dst, const uint val, size_t n)
 {
-    ubyte *d = cast(ubyte*) dst;
-    foreach (i; 0 .. n)
+    const ulong v = cast(ulong) val * 0x0101010101010101;  // Broadcast val to all 8 bytes
+    enum handleLT16Sizes = "
+    switch (n)
+    {
+        case 6:
+            *(cast(uint*) (dst+2)) = cast(uint) v;
+            goto case 2;  // fall-through
+        case 2:
+            *(cast(ushort*) dst) = cast(ushort) v;
+            return;
+
+        case 7:
+            *(cast(uint*) (dst+3)) = cast(uint) v;
+            goto case 3;  // fall-through
+        case 3:
+            *(cast(ushort*) (dst+1)) = cast(ushort) v;
+            goto case 1;  // fall-through
+        case 1:
+            *(cast(ubyte*) dst) = cast(ubyte) v;
+            return;
+
+        case 4:
+            *(cast(uint*) dst) = cast(uint) v;
+            return;
+        case 0:
+            return;
+
+        case 5:
+            *(cast(uint*) (dst+1)) = cast(uint) v;
+            *(cast(ubyte*) dst) = cast(ubyte) v;
+            return;
+        default:
+    }
+    ";
+    mixin(handleLT16Sizes);
+    // NOTE(stefanos): Normally, we would have different alignment
+    // for 32-bit and 64-bit versions. For the sake of simplicity,
+    // we'll let the compiler do the work.
+    ubyte rem = cast(ubyte) dst & 7;
+    if (rem)
+    {  // Unaligned
+        // Move 8 bytes (which we will possibly overlap later).
+        *(cast(ulong*) dst) = v;
+        // Reach alignment
+        dst += 8 - rem;
+        n -= 8 - rem;
+    }
+    ulong *d = cast(ulong*) dst;
+    ulong temp = n / 8;
+    for (size_t i = 0; i != temp; ++i)
     {
-        d[i] = cast(ubyte)val;
+        *d = v;
+        ++d;  // += 8
+        n -= 8;
     }
+    dst = cast(void *) d;
+    mixin(handleLT16Sizes);
 }
 
 /** Core features tests.
@@ -207,13 +262,13 @@ private void memsetNaive(void *dst, const uint val, size_t n)
 unittest
 {
     ubyte[3] a;
-    Dmemset(a, 7);
+    memset(a, 7);
     assert(a[0] == 7);
     assert(a[1] == 7);
     assert(a[2] == 7);
 
     real b;
-    Dmemset(b, 9);
+    memset(b, 9);
     ubyte *p = cast(ubyte*) &b;
     foreach (i; 0 .. b.sizeof)
     {

From 504fc7bbe80dbff527e783d148adb1153839689e Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Mon, 29 Jul 2019 23:05:11 +0300
Subject: [PATCH 20/29] mixin removal in memsetNaive

---
 src/core/experimental/memutils.d | 81 ++++++++++++++++++--------------
 1 file changed, 45 insertions(+), 36 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index d117dd25a4..17d5d66c02 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -33,7 +33,6 @@ void memset(T)(ref T dst, const ubyte val)
     }
 }
 
-
 version (D_SIMD)
 {
     version = useSIMD;
@@ -56,7 +55,7 @@ version (useSIMD)
     /* SIMD implementation
      */
     //pragma(msg, "SIMD used");
-    extern(C) private void Dmemset(void *d, const uint val, size_t n)
+    private void Dmemset(void *d, const uint val, size_t n)
     {
         import core.simd : int4;
         version (LDC)
@@ -198,41 +197,50 @@ else
 // for all < 32 sizes.
 private void memsetNaive(void *dst, const uint val, size_t n)
 {
-    const ulong v = cast(ulong) val * 0x0101010101010101;  // Broadcast val to all 8 bytes
-    enum handleLT16Sizes = "
-    switch (n)
+    // NOTE(stefanos): DMD could not inline it.
+    void handleLT16Sizes(void *d, const ulong v, size_t n)
     {
-        case 6:
-            *(cast(uint*) (dst+2)) = cast(uint) v;
-            goto case 2;  // fall-through
-        case 2:
-            *(cast(ushort*) dst) = cast(ushort) v;
-            return;
+        switch (n)
+        {
+            case 6:
+                *(cast(uint*) (d+2)) = cast(uint) v;
+                goto case 2;  // fall-through
+            case 2:
+                *(cast(ushort*) d) = cast(ushort) v;
+                return;
 
-        case 7:
-            *(cast(uint*) (dst+3)) = cast(uint) v;
-            goto case 3;  // fall-through
-        case 3:
-            *(cast(ushort*) (dst+1)) = cast(ushort) v;
-            goto case 1;  // fall-through
-        case 1:
-            *(cast(ubyte*) dst) = cast(ubyte) v;
-            return;
+            case 7:
+                *(cast(uint*) (d+3)) = cast(uint) v;
+                goto case 3;  // fall-through
+            case 3:
+                *(cast(ushort*) (d+1)) = cast(ushort) v;
+                goto case 1;  // fall-through
+            case 1:
+                *(cast(ubyte*) d) = cast(ubyte) v;
+                return;
 
-        case 4:
-            *(cast(uint*) dst) = cast(uint) v;
-            return;
-        case 0:
-            return;
+            case 4:
+                *(cast(uint*) d) = cast(uint) v;
+                return;
+            case 0:
+                return;
 
-        case 5:
-            *(cast(uint*) (dst+1)) = cast(uint) v;
-            *(cast(ubyte*) dst) = cast(ubyte) v;
-            return;
-        default:
+            case 5:
+                *(cast(uint*) (d+1)) = cast(uint) v;
+                *(cast(ubyte*) d) = cast(ubyte) v;
+                return;
+            default:
+        }
+    }
+
+
+    const ulong v = cast(ulong) val * 0x0101010101010101;  // Broadcast val to all 8 bytes
+    if (n < 8)
+    {
+        handleLT16Sizes(dst, v, n);
+        return;
     }
-    ";
-    mixin(handleLT16Sizes);
+    
     // NOTE(stefanos): Normally, we would have different alignment
     // for 32-bit and 64-bit versions. For the sake of simplicity,
     // we'll let the compiler do the work.
@@ -241,22 +249,23 @@ private void memsetNaive(void *dst, const uint val, size_t n)
     {  // Unaligned
         // Move 8 bytes (which we will possibly overlap later).
         *(cast(ulong*) dst) = v;
-        // Reach alignment
         dst += 8 - rem;
         n -= 8 - rem;
     }
     ulong *d = cast(ulong*) dst;
     ulong temp = n / 8;
-    for (size_t i = 0; i != temp; ++i)
+    for(size_t i = 0; i != temp; ++i)
     {
         *d = v;
-        ++d;  // += 8
+        ++d;
         n -= 8;
     }
     dst = cast(void *) d;
-    mixin(handleLT16Sizes);
+
+    handleLT16Sizes(dst, v, n);
 }
 
+
 /** Core features tests.
   */
 unittest

From 9af240fb7821a51259480008e75f11b2258b55c1 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Mon, 29 Jul 2019 23:07:50 +0300
Subject: [PATCH 21/29] Style fix

---
 src/core/experimental/memutils.d | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 17d5d66c02..3f49e6ffb4 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -240,7 +240,6 @@ private void memsetNaive(void *dst, const uint val, size_t n)
         handleLT16Sizes(dst, v, n);
         return;
     }
-    
     // NOTE(stefanos): Normally, we would have different alignment
     // for 32-bit and 64-bit versions. For the sake of simplicity,
     // we'll let the compiler do the work.
@@ -254,7 +253,7 @@ private void memsetNaive(void *dst, const uint val, size_t n)
     }
     ulong *d = cast(ulong*) dst;
     ulong temp = n / 8;
-    for(size_t i = 0; i != temp; ++i)
+    for (size_t i = 0; i != temp; ++i)
     {
         *d = v;
         ++d;

From 08ffa2c56c1aa1f90d533973e645be8106dd2823 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Tue, 30 Jul 2019 12:57:47 +0300
Subject: [PATCH 22/29] Doc fix

---
 src/core/experimental/memutils.d | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 3f49e6ffb4..17267a93b9 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -4,14 +4,6 @@
  */
 module core.experimental.memutils;
 
-/** memset() implementation */
-
-/**
- * NOTE(stefanos):
- * Range-checking is not needed since the user never
- * pass an `n` (byte count) directly.
- */
-
 /*
   If T is an array, set all `dst`'s bytes
   (whose count is the length of the array times
@@ -54,7 +46,6 @@ version (useSIMD)
 {
     /* SIMD implementation
      */
-    //pragma(msg, "SIMD used");
     private void Dmemset(void *d, const uint val, size_t n)
     {
         import core.simd : int4;
@@ -154,7 +145,6 @@ version (useSIMD)
         d += 16 - rem;
         n -= 16 - rem;
         // Move in blocks of 32.
-        // TODO(stefanos): Experiment with differnt sizes.
         if (n >= 32)
         {
             // Align to (previous) multiple of 32. That does something invisible to the code,
@@ -192,9 +182,9 @@ else
     }
 }
 
-// NOTE(stefanos): We're using naive for the < 32 case in the SIMD version.
-// To be more performant, for that case, we would have a big fall-through switch
-// for all < 32 sizes.
+/*
+  Naive version for when there isn't any vector support (SIMD etc.).
+*/
 private void memsetNaive(void *dst, const uint val, size_t n)
 {
     // NOTE(stefanos): DMD could not inline it.
@@ -253,6 +243,7 @@ private void memsetNaive(void *dst, const uint val, size_t n)
     }
     ulong *d = cast(ulong*) dst;
     ulong temp = n / 8;
+    // Go in steps of 8 - the register size in x86_64.
     for (size_t i = 0; i != temp; ++i)
     {
         *d = v;

From ff8121929f14c4d875276b989e70152bf8009f78 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Tue, 30 Jul 2019 13:04:29 +0300
Subject: [PATCH 23/29] SIMD versioning improvement

---
 src/core/experimental/memutils.d | 85 +++++++++++---------------------
 1 file changed, 30 insertions(+), 55 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 17267a93b9..64e185074b 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -27,19 +27,37 @@ void memset(T)(ref T dst, const ubyte val)
 
 version (D_SIMD)
 {
-    version = useSIMD;
+    import core.simd : float4;
+    enum useSIMD = true;
 }
 else version (LDC)
 {
     // LDC always supports SIMD (but doesn't ever set D_SIMD) and
     // the back-end uses the most appropriate size for every target.
-    version = useSIMD;
+    import core.simd : float4;
+    enum useSIMD = true;
 }
 else version (GNU)
 {
-    // GNU does not support SIMD by default. We have to do more complicated
-    // stuff below. So we start by default with useSIMD and decide later.
-    version = useSIMD;
+    import core.simd : float4;
+    // GNU does not support SIMD by default.
+    version (X86_64)
+    {
+        private enum isX86 = true;
+    }
+    else version (X86)
+    {
+        private enum isX86 = true;
+    }
+
+    static if (isX86 && __traits(compiles, int4))
+    {
+        enum useSIMD = true;
+    }
+    else
+    {
+        enum useSIMD = false;
+    }
 }
 
 version (useSIMD)
@@ -51,49 +69,21 @@ version (useSIMD)
         import core.simd : int4;
         version (LDC)
         {
-            enum gdcSIMD = false;
             import ldc.simd : loadUnaligned, storeUnaligned;
+            void store16i_sse(void *dest, int4 reg)
+            {
+                storeUnaligned!int4(reg, cast(int*) dest);
+            }
         }
         else version (DigitalMars)
         {
-            enum gdcSIMD = false;
             import core.simd : void16, loadUnaligned, storeUnaligned;
-        }
-        else version (GNU)
-        {
-            // NOTE(stefanos): I could not combine GDC versioning in `useSIMD`.
-            // To know if we can use SIMD for GDC is more complex. We need to:
-            // - Be in x86 arch since the intrinsics (builtins) are only x86 specific.
-            // - Compile the int4 vector size.
-            // TODO(stefanos): The GCC specification points that to use the store intrinsic,
-            // we have to be in SSE2. Is this guaranteed if `int4` compiles?
-            // Note that GCC builtins provide the __builtin_cpu_supports() but this is a runtime
-            // function.
-            version (X86_64)
-            {
-                enum isX86 = true;
-            }
-            else version (X86)
-            {
-                enum isX86 = true;
-            }
-
-            static if (isX86 && __traits(compiles, int4))
-            {
-                enum gdcSIMD = true;
-            }
-            else
+            void store16i_sse(void *dest, int4 reg)
             {
-                memsetNaive(d, val, n);
-                return;
+                storeUnaligned(cast(void16*) dest, reg);
             }
         }
-
-        // TODO(stefanos): Is there a way to make them @safe?
-        // (The problem is that for LDC, they could take int* or float* pointers
-        // but the cast to void16 for DMD is necessary anyway).
-
-        static if (gdcSIMD)
+        else
         {
             import gcc.builtins;
             import core.simd : ubyte16;
@@ -102,21 +92,6 @@ version (useSIMD)
                 __builtin_ia32_storedqu(cast(char*) dest, cast(ubyte16) reg);
             }
         }
-        else
-        {
-            void store16i_sse(void *dest, int4 reg)
-            {
-                version (LDC)
-                {
-                    storeUnaligned!int4(reg, cast(int*) dest);
-                }
-                else
-                {
-                    storeUnaligned(cast(void16*) dest, reg);
-                }
-            }
-
-        }
         void store32i_sse(void *dest, int4 reg)
         {
             store16i_sse(dest, reg);

From 4e6654b6d7c05407d39b986f6a536bfd774eb645 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Tue, 30 Jul 2019 17:32:23 +0300
Subject: [PATCH 24/29] Doc improvement

---
 src/core/experimental/memutils.d | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 64e185074b..11df0f22ff 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -4,12 +4,12 @@
  */
 module core.experimental.memutils;
 
-/*
-  If T is an array, set all `dst`'s bytes
-  (whose count is the length of the array times
-  the size of the array element) to `val`.
-  Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`.
-*/
+/**
+ * If T is an array, set all `dst`'s bytes
+ * (whose count is the length of the array times
+ * the size of the array element) to `val`.
+ * Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`.
+ */
 void memset(T)(ref T dst, const ubyte val)
 {
     import core.internal.traits : isArray;
@@ -97,7 +97,6 @@ version (useSIMD)
             store16i_sse(dest, reg);
             store16i_sse(dest+0x10, reg);
         }
-
         // NOTE(stefanos): I use the naive version, which in my benchmarks was slower
         // than the previous classic switch. BUT. Using the switch had a significant
         // drop in the rest of the sizes. It's not the branch that is responsible for the drop,
@@ -157,8 +156,7 @@ else
     }
 }
 
-/*
-  Naive version for when there isn't any vector support (SIMD etc.).
+/* Naive version for when there isn't any vector support (SIMD etc.).
 */
 private void memsetNaive(void *dst, const uint val, size_t n)
 {

From d7b8a0b83f9e0777b650e8dbc744db311297a41a Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Tue, 30 Jul 2019 17:37:33 +0300
Subject: [PATCH 25/29] Doc improvement 2

---
 src/core/experimental/memutils.d | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 11df0f22ff..76638f501f 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -9,6 +9,13 @@ module core.experimental.memutils;
  * (whose count is the length of the array times
  * the size of the array element) to `val`.
  * Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`.
+ * 
+ * Params
+ *  val = The byte with which we want to fill memory with.
+ *  dst = Memory Destination whose bytes are to be set to `val`.
+ *
+ * Returns:
+ *  Nothing.
  */
 void memset(T)(ref T dst, const ubyte val)
 {

From 83541f783aee82fceacc4dfe4da6be04585d2cc2 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Tue, 30 Jul 2019 17:47:30 +0300
Subject: [PATCH 26/29] Minor changes

---
 src/core/experimental/memutils.d | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 76638f501f..5e52ac25b9 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -9,8 +9,8 @@ module core.experimental.memutils;
  * (whose count is the length of the array times
  * the size of the array element) to `val`.
  * Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`.
- * 
- * Params
+ *
+ * Params:
  *  val = The byte with which we want to fill memory with.
  *  dst = Memory Destination whose bytes are to be set to `val`.
  *

From b9bc30c652eb2f52912e6c7800b6be2195f25316 Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Tue, 30 Jul 2019 17:52:55 +0300
Subject: [PATCH 27/29] Changed Returns to N.B.

---
 src/core/experimental/memutils.d | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 5e52ac25b9..51bbc1662d 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -9,13 +9,11 @@ module core.experimental.memutils;
  * (whose count is the length of the array times
  * the size of the array element) to `val`.
  * Otherwise, set T.sizeof bytes to `val` starting from the address of `dst`.
+ * N.B.: Contrary to the C Standard Library memset(), this function returns nothing.
  *
  * Params:
  *  val = The byte with which we want to fill memory with.
  *  dst = Memory Destination whose bytes are to be set to `val`.
- *
- * Returns:
- *  Nothing.
  */
 void memset(T)(ref T dst, const ubyte val)
 {

From 1204a8b9d3caafca755902cac69836a56df07bbc Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Sat, 3 Aug 2019 15:18:45 +0300
Subject: [PATCH 28/29] Add test for empty array

---
 src/core/experimental/memutils.d | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 51bbc1662d..44ef7e3431 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -251,4 +251,8 @@ unittest
     {
         assert(p[i] == 9);
     }
+
+    // Verify that it does not crash on empty array.
+    ubyte[0] c;
+    memset(c, 9);
 }

From a4c7a8d5486e01fb1571db46fbf0aa3c82c32efd Mon Sep 17 00:00:00 2001
From: Stefanos Baziotis <sdi1600105@di.uoa.gr>
Date: Sat, 3 Aug 2019 15:22:52 +0300
Subject: [PATCH 29/29] Added @nogc nothrow

---
 src/core/experimental/memutils.d | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/core/experimental/memutils.d b/src/core/experimental/memutils.d
index 44ef7e3431..6b5735b9f1 100644
--- a/src/core/experimental/memutils.d
+++ b/src/core/experimental/memutils.d
@@ -15,7 +15,7 @@ module core.experimental.memutils;
  *  val = The byte with which we want to fill memory with.
  *  dst = Memory Destination whose bytes are to be set to `val`.
  */
-void memset(T)(ref T dst, const ubyte val)
+void memset(T)(ref T dst, const ubyte val) nothrow @nogc
 {
     import core.internal.traits : isArray;
     const uint v = cast(uint) val;
@@ -69,13 +69,13 @@ version (useSIMD)
 {
     /* SIMD implementation
      */
-    private void Dmemset(void *d, const uint val, size_t n)
+    private void Dmemset(void *d, const uint val, size_t n) nothrow @nogc
     {
         import core.simd : int4;
         version (LDC)
         {
             import ldc.simd : loadUnaligned, storeUnaligned;
-            void store16i_sse(void *dest, int4 reg)
+            void store16i_sse(void *dest, int4 reg) nothrow @nogc
             {
                 storeUnaligned!int4(reg, cast(int*) dest);
             }
@@ -83,7 +83,7 @@ version (useSIMD)
         else version (DigitalMars)
         {
             import core.simd : void16, loadUnaligned, storeUnaligned;
-            void store16i_sse(void *dest, int4 reg)
+            void store16i_sse(void *dest, int4 reg) nothrow @nogc
             {
                 storeUnaligned(cast(void16*) dest, reg);
             }
@@ -92,12 +92,12 @@ version (useSIMD)
         {
             import gcc.builtins;
             import core.simd : ubyte16;
-            void store16i_sse(void *dest, int4 reg)
+            void store16i_sse(void *dest, int4 reg) nothrow @nogc
             {
                 __builtin_ia32_storedqu(cast(char*) dest, cast(ubyte16) reg);
             }
         }
-        void store32i_sse(void *dest, int4 reg)
+        void store32i_sse(void *dest, int4 reg) nothrow @nogc
         {
             store16i_sse(dest, reg);
             store16i_sse(dest+0x10, reg);
@@ -155,7 +155,7 @@ else
 {
     /* Forward to simple implementation.
      */
-    private void Dmemset(void *d, const uint val, size_t n)
+    private void Dmemset(void *d, const uint val, size_t n) nothrow @nogc
     {
         memsetNaive(d, val, n);
     }
@@ -163,7 +163,7 @@ else
 
 /* Naive version for when there isn't any vector support (SIMD etc.).
 */
-private void memsetNaive(void *dst, const uint val, size_t n)
+private void memsetNaive(void *dst, const uint val, size_t n) nothrow @nogc
 {
     // NOTE(stefanos): DMD could not inline it.
     void handleLT16Sizes(void *d, const ulong v, size_t n)