druntime (public archive)
This repository was archived by the owner on Oct 12, 2022. It is now read-only.
Closed
Changes from all commits
29 commits
847140e  memutils: Replacement of libc string.h functions - currently only Dme… (baziotis, Jul 4, 2019)
f991173  Style fix (baziotis, Jul 4, 2019)
ea2ce59  Versioning fix (baziotis, Jul 4, 2019)
bac120f  Versioning fix vol. 2 (baziotis, Jul 4, 2019)
ff7e755  Independency of std.traits (baziotis, Jul 4, 2019)
497e53f  Minor fixes/changes (baziotis, Jul 5, 2019)
c52c099  Style and layout changes (baziotis, Jul 5, 2019)
6ebec4b  Moved tests to test folder (baziotis, Jul 5, 2019)
57552ed  More naming and style changes (baziotis, Jul 5, 2019)
60b3967  Minor fix (baziotis, Jul 5, 2019)
4faa8f8  Versioning improvement (baziotis, Jul 5, 2019)
a161b98  Move std.traits code to core.internal.traits (baziotis, Jul 5, 2019)
5da39a9  Naming fix (baziotis, Jul 5, 2019)
cc6d019  Fix in using non-existent code in internal.traits unittests (baziotis, Jul 5, 2019)
7b9eb3c  Fix for uint vs ubyte in memsetNaive (baziotis, Jul 5, 2019)
08d044f  Removed escaping from tests in memutils (baziotis, Jul 6, 2019)
d611a18  Versioning improvement (baziotis, Jul 7, 2019)
00ca80a  GDC SIMD version and bug fix (baziotis, Jul 7, 2019)
9ad8f16  Not so naive version of memsetNaive (baziotis, Jul 7, 2019)
504fc7b  mixin removal in memsetNaive (baziotis, Jul 29, 2019)
9af240f  Style fix (baziotis, Jul 29, 2019)
08ffa2c  Doc fix (baziotis, Jul 30, 2019)
ff81219  SIMD versioning improvement (baziotis, Jul 30, 2019)
4e6654b  Doc improvement (baziotis, Jul 30, 2019)
d7b8a0b  Doc improvement 2 (baziotis, Jul 30, 2019)
83541f7  Minor changes (baziotis, Jul 30, 2019)
b9bc30c  Changed Returns to N.B. (baziotis, Jul 30, 2019)
1204a8b  Add test for empty array (baziotis, Aug 3, 2019)
a4c7a8d  Added @nogc nothrow (baziotis, Aug 3, 2019)
2 changes: 2 additions & 0 deletions mak/COPY
@@ -21,6 +21,8 @@ COPY=\
	$(IMPDIR)\core\time.d \
	$(IMPDIR)\core\vararg.d \
	\
	$(IMPDIR)\core\experimental\memutils.d \
	\
	$(IMPDIR)\core\internal\abort.d \
	$(IMPDIR)\core\internal\arrayop.d \
	$(IMPDIR)\core\internal\convert.d \
2 changes: 2 additions & 0 deletions mak/DOCS
@@ -19,6 +19,8 @@ DOCS=\
	$(DOCDIR)\core_gc_config.html \
	$(DOCDIR)\core_gc_gcinterface.html \
	$(DOCDIR)\core_gc_registry.html \
	\
	$(DOCDIR)\core_experimental_memutils.html \
	\
	$(DOCDIR)\core_stdc_assert_.html \
	$(DOCDIR)\core_stdc_config.html \
2 changes: 2 additions & 0 deletions mak/SRCS
@@ -16,6 +16,8 @@ SRCS=\
	src\core\thread.d \
	src\core\time.d \
	src\core\vararg.d \
	\
	src\core\experimental\memutils.d \
	\
	src\core\gc\config.d \
	src\core\gc\gcinterface.d \
3 changes: 3 additions & 0 deletions mak/WINDOWS
@@ -116,6 +116,9 @@ $(IMPDIR)\core\gc\gcinterface.d : src\core\gc\gcinterface.d
$(IMPDIR)\core\gc\registry.d : src\core\gc\registry.d
	copy $** $@

$(IMPDIR)\core\experimental\memutils.d : src\core\experimental\memutils.d
	copy $** $@

$(IMPDIR)\core\internal\abort.d : src\core\internal\abort.d
	copy $** $@

258 changes: 258 additions & 0 deletions src/core/experimental/memutils.d
@@ -0,0 +1,258 @@
/**
 * Pure D replacement for the basic memory building blocks of the C standard
 * library's string.h.
 * Source: $(DRUNTIMESRC core/experimental/memutils.d)
 */
module core.experimental.memutils;

/**
 * If T is an array, set all of `dst`'s bytes
 * (whose count is the length of the array times
 * the size of an array element) to `val`.
 * Otherwise, set T.sizeof bytes to `val`, starting from the address of `dst`.
 * N.B.: Contrary to the C standard library memset(), this function returns nothing.

Contributor:
why deviate from c stdlib memset on this point?

Contributor Author:
Because these utilities were not created with the idea of having the exact C interface. We decided to drop legacy C details that seemed useless, like the return value and the fact that memset takes an int instead of a byte.

Contributor Author:
I'm not saying this is necessarily the best idea ever, because no matter how irrelevant the legacy details are, people have been used to them for years.

 *
 * Params:
 *     val = The byte value with which to fill the memory.
 *     dst = The memory destination whose bytes are to be set to `val`.
 */
void memset(T)(ref T dst, const ubyte val) nothrow @nogc
{
    import core.internal.traits : isArray;
    const uint v = cast(uint) val;
    static if (isArray!T)
    {
        size_t n = dst.length * typeof(dst[0]).sizeof;
        Dmemset(dst.ptr, v, n);
    }
    else
    {
        Dmemset(&dst, v, T.sizeof);
    }
}
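
For orientation, a minimal usage sketch of the API above (editor's example, not part of the diff; the module path follows this file):

    import core.experimental.memutils : memset;

    void example() nothrow @nogc
    {
        // Array case: sets length * element-size bytes (here 4 * 4 = 16).
        int[4] arr;
        memset(arr, 0xFF); // every byte of arr becomes 0xFF

        // Non-array case: sets T.sizeof bytes starting at the address of x.
        double x;
        memset(x, 0); // zeroes all 8 bytes of x
    }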

version (D_SIMD)
{
    import core.simd : int4;
    enum useSIMD = true;
}
else version (LDC)
{
    // LDC always supports SIMD (but never sets D_SIMD) and
    // the back-end uses the most appropriate size for every target.
    import core.simd : int4;
    enum useSIMD = true;
}
else version (GNU)
{
    // GDC does not enable SIMD by default, so probe for it, and only on x86/x86_64.
    version (X86_64)
    {
        private enum isX86 = true;
    }
    else version (X86)
    {
        private enum isX86 = true;
    }
    else
    {
        private enum isX86 = false;
    }

    static if (isX86 && __traits(compiles, { import core.simd : int4; }))
    {
        enum useSIMD = true;
    }
    else
    {
        enum useSIMD = false;
    }
}
else
{
    // No SIMD support detected; fall back to the naive implementation.
    enum useSIMD = false;
}
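
To check which path a given compiler actually selects, a one-line compile-time probe can be dropped into the module (editor's sketch, assuming the useSIMD enum above):

    // Prints at compile time which Dmemset implementation will be built.
    pragma(msg, useSIMD ? "memutils: SIMD Dmemset" : "memutils: naive Dmemset");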

static if (useSIMD)
{
    /* SIMD implementation.
     */
    private void Dmemset(void *d, const uint val, size_t n) nothrow @nogc

Contributor (TurkeyMan, Aug 5, 2019):
I can't see a path for val == 0... It is usually possible to make zeromem substantially faster.
But again, none of this code should apply to LDC/GDC; those compilers have intrinsics for this stuff. This should be considered strictly for DMD.
For instance, on PowerPC there's an instruction dcbz (data cache-block zero) which resets an entire 256-byte cache line to zero in one instantaneous operation... I can't see your PPC branch that takes advantage of 256-byte alignment and dcbz... but the backends all do.
Assuming this is targeting DMD, your assumptions that the compiler won't do anything interesting and that the target is x64 are both reasonable; otherwise I can't get behind any of this code.

Contributor Author:
> I can't see a path for val == 0... It is usually possible to make zeromem substantially faster.

No memset that I know of does anything different for val == 0 when targeting x64.

> But again, none of this code should apply to LDC/GDC; those compilers have intrinsics for this stuff.

Check this: #2687 (comment)

PowerPC was not a target of this project.

> Assuming this is targeting DMD, your assumptions that the compiler won't do anything interesting and that the target is x64 are both reasonable.

The focus was initially DMD, then LDC and GDC by not using their intrinsics. It's again the same question: what happens when libc is not available, and do we care?

Contributor (TurkeyMan):
> PowerPC was not a target of this project.

Right, that's my point, and why I think it's not a good idea to re-implement these functions. There are too many versions of them to worry about, and maintaining great perf is a moving target as architectures like x86 and ARM evolve.

> What happens when libc is not available and do we care?

libc is always available... what is a case when it's not?
I'd also like to know whether the intrinsics definitely call out to libc, or whether they call into the compiler's runtime lib. What is the symbol that the intrinsics call?

Contributor Author:
I'll have to look at this. Let me be clear again: I don't know for a fact that they call libc; it just seemed that way because they execute the same code. I'll definitely have to look into it if we move forward with this PR.

    {
        import core.simd : int4;
        version (LDC)
        {
            import ldc.simd : loadUnaligned, storeUnaligned;
            void store16i_sse(void *dest, int4 reg) nothrow @nogc
            {
                storeUnaligned!int4(reg, cast(int*) dest);
            }
        }
        else version (DigitalMars)
        {
            import core.simd : void16, loadUnaligned, storeUnaligned;
            void store16i_sse(void *dest, int4 reg) nothrow @nogc
            {
                storeUnaligned(cast(void16*) dest, reg);
            }
        }
        else
        {
            import gcc.builtins;
            import core.simd : ubyte16;
            void store16i_sse(void *dest, int4 reg) nothrow @nogc
            {
                __builtin_ia32_storedqu(cast(char*) dest, cast(ubyte16) reg);
            }
        }
        void store32i_sse(void *dest, int4 reg) nothrow @nogc
        {
            store16i_sse(dest, reg);
            store16i_sse(dest + 0x10, reg);
        }
        // NOTE(stefanos): I use the naive version for small sizes, which in my
        // benchmarks was slower than the previous classic switch. BUT using the
        // switch caused a significant drop for the rest of the sizes. It's not the
        // branch that is responsible for the drop, but the fact that the switch is
        // more difficult to optimize as part of the rest of the code.
        if (n < 32)
        {
            memsetNaive(d, val, n);
            return;
        }
        void *temp = d + n - 0x10;       // Used for the last (at most) 32 bytes.
        const uint v = val * 0x01010101; // Broadcast val to all 4 bytes.

Contributor (TurkeyMan):
mul is almost never a good idea. I suspect the mul may be a bottleneck in the ~32-128 byte range. If you're going to do it, why not extend to 8 bytes and get more value from it?

Do you have a reference for this code? It doesn't look very optimal to me. If you're going to use SSE, there are broadcast and permute functions, which don't introduce hazards as bad as mul...

Contributor Author (baziotis, Aug 5, 2019):
Yes, it's inspired by Agner Fog. I didn't copy code from him, but I have read his optimization manual, and this mul trick is his. And no, a mul on x86 is not a problem nowadays.

Our code turned out to be similar to, e.g., his AVX version: https://github.com/tpn/agner/blob/master/asmlib/asmlibSrc/memset64.asm#L188

It's also inspired by GCC.

Edit: Oh, and there's no point in extending to 8 bytes. The value is immediately broadcast into an XMM register.

Contributor (TurkeyMan):
I'm hyper-aware of this mul trick, I've been doing this for decades, but it absolutely IS a problem nowadays... I've written a lot of code using mul tricks like this and been surprised when they became an unintuitive bottleneck.

It might not be a problem here, because memset is usually not called in a hot loop, but multiplies usually have a lot of limitations on the pipeline. There's usually only one imul pipeline, and while the latency might not be so bad these days, the circuit is somewhat recursive, so it can't accept a new instruction per cycle like add/sub circuits can; if multiple muls contend for the limited imul resource, they create stalling hazards on each other. imul is often used in address calculation, so there's a reasonable possibility of contention, but there's an if above this, so the pipeline might already have flushed...

I do a lot of work with colour precision scaling using this same trick; the imul is almost always the limiting factor in my loops.

Anyway, I only say this because you're shoving the value straight into SSE on the following line, where you could use much faster permute operations to broadcast the value very easily (like pshufb, or its counterparts on other architectures).

Contributor Author:
I didn't know all that about muls limiting the pipeline; I just knew that the cycles they consume are about the same as, say, add/sub. Is there somewhere I can read more about this?

But the probability of Agner not having thought of this is close to 0. :P
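
As a side note, the arithmetic behind the debated mul trick is easy to check in isolation (editor's sketch; the helper names are illustrative, not from the PR):

    // Replicate one byte into every byte of a wider word by multiplying with a
    // pattern of 0x01 bytes: 0xAB * 0x01010101 == 0xABABABAB.
    uint broadcast4(ubyte b) nothrow @nogc
    {
        return b * 0x01010101;
    }

    ulong broadcast8(ubyte b) nothrow @nogc
    {
        return cast(ulong) b * 0x0101010101010101UL;
    }

    unittest
    {
        assert(broadcast4(0xAB) == 0xABABABAB);
        assert(broadcast8(0x07) == 0x0707070707070707UL);
    }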

        // Broadcast v to all bytes of an XMM register.
        auto xmm0 = int4(v);
        // Remainder from the previous 16-byte boundary.
        const ubyte rem = cast(size_t) d & 15;
        // Store 16 bytes, of which some will possibly overlap with a future store.
        // For example, if `rem` is 7, we want to store 16 - 7 = 9 bytes unaligned,
        // add 16 - 7 = 9 to `d` and start storing aligned. Since 16 - `rem` can be
        // at most 16, we store 16 bytes anyway.
        store16i_sse(d, xmm0);
        d += 16 - rem;
        n -= 16 - rem;
        // Move in blocks of 32.
        if (n >= 32)
        {
            // Round `n` down to the previous multiple of 32. That does something
            // invisible to the code, but a good optimizer will avoid a `cmp`
            // instruction inside the loop. With a multiple of 32, the end of the
            // loop can be (if we assume that `n` is in RDX):
            //     sub RDX, 32;
            //     jge START_OF_THE_LOOP;
            // Without that, it has to be:
            //     sub RDX, 32;
            //     cmp RDX, 32;
            //     jge START_OF_THE_LOOP;
            // NOTE that because we round down to the _previous_ multiple (for 37 we
            // go to 32), we somehow have to compensate for the missed bytes, which
            // is done at the end of this function.
            n &= -32;
            do
            {
                store32i_sse(d, xmm0);
                // NOTE(stefanos): I tried avoiding this operation on `d` by
                // combining `d` and `n` in the loop above and going backwards.
                // It was slower in my benchmarks.
                d += 32;
                n -= 32;
            } while (n >= 32);
        }
        // Compensate for the last (at most) 32 bytes.
        store32i_sse(temp - 0x10, xmm0);
    }
}
else
{
    /* Forward to the simple implementation.
     */
    private void Dmemset(void *d, const uint val, size_t n) nothrow @nogc
    {
        memsetNaive(d, val, n);
    }
}

/* Naive version for when there isn't any vector support (SIMD etc.).
 */
private void memsetNaive(void *dst, const uint val, size_t n) nothrow @nogc
{
    // Handles sizes 0 through 7.
    // NOTE(stefanos): DMD could not inline this.
    void handleLT16Sizes(void *d, const ulong v, size_t n)
    {
        switch (n)
        {
        case 6:
            *(cast(uint*) (d + 2)) = cast(uint) v;
            goto case 2; // fall through
        case 2:
            *(cast(ushort*) d) = cast(ushort) v;
            return;

        case 7:
            *(cast(uint*) (d + 3)) = cast(uint) v;
            goto case 3; // fall through
        case 3:
            *(cast(ushort*) (d + 1)) = cast(ushort) v;
            goto case 1; // fall through
        case 1:
            *(cast(ubyte*) d) = cast(ubyte) v;
            return;

        case 4:
            *(cast(uint*) d) = cast(uint) v;
            return;
        case 0:
            return;

        case 5:
            *(cast(uint*) (d + 1)) = cast(uint) v;
            *(cast(ubyte*) d) = cast(ubyte) v;
            return;
        default:
            break;
        }
    }

    const ulong v = cast(ulong) val * 0x0101010101010101; // Broadcast val to all 8 bytes.
    if (n < 8)
    {
        handleLT16Sizes(dst, v, n);
        return;
    }
    // NOTE(stefanos): Normally, we would have different alignment handling for the
    // 32-bit and 64-bit versions. For the sake of simplicity, we'll let the
    // compiler do the work.
    const ubyte rem = cast(size_t) dst & 7;
    if (rem)
    {
        // Unaligned: move 8 bytes, some of which a later store will overwrite.
        *(cast(ulong*) dst) = v;
        dst += 8 - rem;
        n -= 8 - rem;
    }
    ulong *d = cast(ulong*) dst;
    const ulong steps = n / 8;
    // Go in steps of 8 - the register size on x86_64.
    for (size_t i = 0; i != steps; ++i)
    {
        *d = v;
        ++d;
        n -= 8;
    }
    dst = cast(void*) d;

    handleLT16Sizes(dst, v, n);
}


/** Core feature tests.
 */
unittest
{
    ubyte[3] a;
    memset(a, 7);
    assert(a[0] == 7);
    assert(a[1] == 7);
    assert(a[2] == 7);

    real b;
    memset(b, 9);
    ubyte *p = cast(ubyte*) &b;
    foreach (i; 0 .. b.sizeof)
    {
        assert(p[i] == 9);
    }

    // Verify that it does not crash on an empty array.
    ubyte[0] c;
    memset(c, 9);
}
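
The trickiest paths above are the overlapping head/tail stores and the alignment fix-ups; a boundary-sweeping test along these lines (editor's sketch, assuming it lives in this module so it can reach the private Dmemset) would exercise them:

    unittest
    {
        // Sweep lengths across the interesting boundaries (0..7, the 16/32-byte
        // SIMD cut-offs) and start offsets within an 8-byte word, then verify
        // every byte inside the region and the guard bytes around it.
        enum maxLen = 96;
        ubyte[maxLen + 16] buf;
        foreach (offset; 0 .. 8)
        {
            foreach (len; 0 .. maxLen + 1)
            {
                buf[] = 0xCC; // poison
                Dmemset(buf.ptr + offset, 0xAB, len);
                foreach (i; 0 .. buf.length)
                {
                    const inside = i >= offset && i < offset + len;
                    assert(buf[i] == (inside ? 0xAB : 0xCC));
                }
            }
        }
    }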