From 76aed3a29da8c0c179ddd9b135af88de48bbc316 Mon Sep 17 00:00:00 2001 From: helloguo Date: Wed, 14 Sep 2016 23:15:26 -0700 Subject: [PATCH 1/2] update JIT_MemSet/MemCpy, Buffer::BlockCopy and Buffer::InternalBlockCopy --- src/vm/amd64/CrtHelpers.asm | 742 ++++++++++++------------------------ src/vm/comutilnative.cpp | 8 + 2 files changed, 253 insertions(+), 497 deletions(-) diff --git a/src/vm/amd64/CrtHelpers.asm b/src/vm/amd64/CrtHelpers.asm index 6ec6e4d2a9fd..9e883353dd7f 100644 --- a/src/vm/amd64/CrtHelpers.asm +++ b/src/vm/amd64/CrtHelpers.asm @@ -13,516 +13,264 @@ ; *********************************************************************** include AsmMacros.inc -include asmconstants.inc -; JIT_MemSet/JIT_MemCpy -; -; It is IMPORANT that the exception handling code is able to find these guys -; on the stack, but to keep them from being tailcalled by VC++ we need to turn -; off optimization and it ends up being a wasteful implementation. -; -; Hence these assembly helpers. -; - - -;*** -;memset.asm - set a section of memory to all one byte -; -; Licensed to the .NET Foundation under one or more agreements. -; The .NET Foundation licenses this file to you under the MIT license. -; See the LICENSE file in the project root for more information.; -; -;******************************************************************************* - -;*** ;char *memset(dst, value, count) - sets "count" bytes at "dst" to "value" -; -;Purpose: -; Sets the first "count" bytes of the memory starting -; at "dst" to the character value "value". 
-; -; Algorithm: -; char * -; memset (dst, value, count) -; char *dst; -; char value; -; unsigned int count; -; { -; char *start = dst; -; -; while (count--) -; *dst++ = value; -; return(start); -; } -; -;Entry: -; char *dst - pointer to memory to fill with value -; char value - value to put in dst bytes -; int count - number of bytes of dst to fill -; -;Exit: -; returns dst, with filled bytes -; -;Uses: -; -;Exceptions: -; -;******************************************************************************* - -CACHE_LIMIT_MEMSET equ 070000h ; limit for nontemporal fill - LEAF_ENTRY JIT_MemSet, _TEXT - mov rax, rcx ; save destination address - cmp r8, 8 ; check if 8 bytes to fill - jb short mset40 ; if b, less than 8 bytes to fill movzx edx, dl ; set fill pattern - mov r9, 0101010101010101h ; replicate fill over 8 bytes - imul rdx, r9 ; - cmp r8, 64 ; check if 64 bytes to fill - jb short mset20 ; if b, less than 64 bytes - -; -; Large block - fill alignment bytes. -; - -mset00: neg rcx ; compute bytes to alignment - and ecx, 7 ; - jz short mset10 ; if z, no alignment required - sub r8, rcx ; adjust remaining bytes by alignment - mov [rax], rdx ; fill alignment bytes -mset10: add rcx, rax ; compute aligned destination address - -; -; Attempt to fill 64-byte blocks -; - - mov r9, r8 ; copy count of bytes remaining - and r8, 63 ; compute remaining byte count - shr r9, 6 ; compute number of 64-byte blocks - test r9, r9 ; remove partial flag stall caused by shr - jnz short mset70 ; if nz, 64-byte blocks to fill - -; -; Fill 8-byte bytes. 
-; - -mset20: mov r9, r8 ; copy count of bytes remaining - and r8, 7 ; compute remaining byte count - shr r9, 3 ; compute number of 8-byte blocks - test r9, r9 ; remove partial flag stall caused by shr - jz short mset40 ; if z, no 8-byte blocks - - align ; simpler way to align instrucitons - -mset30: mov [rcx], rdx ; fill 8-byte blocks - add rcx, 8 ; advance to next 8-byte block - dec r9 ; decrement loop count - jnz short mset30 ; if nz, more 8-byte blocks - -; -; Fill residual bytes. -; - -mset40: test r8, r8 ; test if any bytes to fill - jz short mset60 ; if z, no bytes to fill -mset50: mov [rcx], dl ; fill byte - inc rcx ; advance to next byte - dec r8 ; decrement loop count - jnz short mset50 ; if nz, more bytes to fill -mset60: - ; for some reason the assembler doesn't like the REPRET macro on the same line as a label - REPRET ; return - -; -; Fill 64-byte blocks. -; - - align 16 - - db 066h, 066h, 066h, 090h - db 066h, 066h, 090h - -mset70: cmp r9, CACHE_LIMIT_MEMSET / 64 ; check if large fill - jae short mset90 ; if ae, large fill -mset80: mov [rcx], rdx ; fill 64-byte block - mov 8[rcx], rdx ; - mov 16[rcx], rdx ; - add rcx, 64 ; advance to next block - mov (24 - 64)[rcx], rdx ; - mov (32 - 64)[rcx], rdx ; - dec r9 ; decrement loop count - mov (40 - 64)[rcx], rdx ; - mov (48 - 64)[rcx], rdx ; - mov (56 - 64)[rcx], rdx ; - jnz short mset80 ; if nz, more 64-byte blocks - jmp short mset20 ; finish in common code - -; -; Fill 64-byte blocks nontemporal. 
-; - - align - -mset90: movnti [rcx], rdx ; fill 64-byte block - movnti 8[rcx], rdx ; - movnti 16[rcx], rdx ; - add rcx, 64 ; advance to next block - movnti (24 - 64)[rcx], rdx ; - movnti (32 - 64)[rcx], rdx ; - dec r9 ; decrement loop count - movnti (40 - 64)[rcx], rdx ; - movnti (48 - 64)[rcx], rdx ; - movnti (56 - 64)[rcx], rdx ; - jnz short mset90 ; if nz, move 64-byte blocks - lock or byte ptr [rsp], 0 ; flush data to memory - jmp mset20 ; finish in common code + mov r9, 0101010101010101h + imul rdx, r9 ; rdx is 8 bytes filler + + cmp r8, 16 + jbe mset04 + + cmp r8, 512 + jbe mset00 + + ; count > 512 + mov r10, rcx ; save dst address + mov r11, rdi ; save rdi + mov eax, edx ; eax is value + mov rdi, rcx ; rdi is dst + mov rcx, r8 ; rcx is count + rep stosb + mov rdi, r11 ; restore rdi + mov rax, r10 + ret + + align 16 +mset00: mov rax, rcx ; save dst address + movd xmm0, rdx + punpcklbw xmm0, xmm0 ; xmm0 is 16 bytes filler + + cmp r8, 128 + jbe mset02 + + ; count > 128 && count <= 512 + mov r9, r8 + shr r9, 7 ; count/128 + + align 16 +mset01: movdqu [rcx], xmm0 + movdqu 16[rcx], xmm0 + movdqu 32[rcx], xmm0 + movdqu 48[rcx], xmm0 + movdqu 64[rcx], xmm0 + movdqu 80[rcx], xmm0 + movdqu 96[rcx], xmm0 + movdqu 112[rcx], xmm0 + add rcx, 128 + dec r9 + jnz mset01 + and r8, 7fh ; and r8 with 0111 1111 + + ; the remainder is from 0 to 127 + cmp r8, 16 + jnbe mset02 + + ; the remainder <= 16 + movdqu -16[rcx + r8], xmm0 + ret + + ; count > 16 && count <= 128 for mset02 + align 16 +mset02: movdqu [rcx], xmm0 + movdqu -16[rcx + r8], xmm0 + cmp r8, 32 + jbe mset03 + + ; count > 32 && count <= 64 + movdqu 16[rcx], xmm0 + movdqu -32[rcx + r8], xmm0 + cmp r8, 64 + jbe mset03 + + ; count > 64 && count <= 128 + movdqu 32[rcx], xmm0 + movdqu 48[rcx], xmm0 + movdqu -48[rcx + r8], xmm0 + movdqu -64[rcx + r8], xmm0 +mset03: ret + + align 16 +mset04: mov rax, rcx ; save dst address + test r8b, 24 ; and r8b with 0001 1000 + jz mset05 + + ; count >= 8 && count <= 16 + mov [rcx], rdx + 
mov -8[rcx + r8], rdx + ret + + align 16 +mset05: test r8b, 4 ; and r8b with 0100 + jz mset06 + + ; count >= 4 && count < 8 + mov [rcx], edx + mov -4[rcx + r8], edx + ret + + ; count >= 0 && count < 4 + align 16 +mset06: test r8b, 1 ; and r8b with 0001 + jz mset07 + mov [rcx],dl +mset07: test r8b, 2 ; and r8b with 0010 + jz mset08 + mov -2[rcx + r8], dx +mset08: ret LEAF_END_MARKED JIT_MemSet, _TEXT -;******************************************************************************* -; This ensures that atomic updates of aligned fields will stay atomic. -;*** -;JIT_MemCpy - Copy source buffer to destination buffer -; -;Purpose: -;JIT_MemCpy - Copy source buffer to destination buffer -; -;Purpose: -; JIT_MemCpy() copies a source memory buffer to a destination memory -; buffer. This routine recognize overlapping buffers to avoid propogation. -; For cases where propogation is not a problem, memcpy() can be used. -; -;Entry: -; void *dst = pointer to destination buffer -; const void *src = pointer to source buffer -; size_t count = number of bytes to copy -; -;Exit: -; Returns a pointer to the destination buffer in AX/DX:AX -; -;Uses: -; CX, DX -; -;Exceptions: -;******************************************************************************* -; This ensures that atomic updates of aligned fields will stay atomic. - -CACHE_LIMIT_MEMMOV equ 040000h ; limit for nontemporal fill -CACHE_BLOCK equ 01000h ; nontemporal move block size - +; from 0 to 64 bytes, no need to check overlap +; because we use xmm0 to xmm3 to store the src buffer LEAF_ENTRY JIT_MemCpy, _TEXT - mov r11, rcx ; save destination address - sub rdx, rcx ; compute offset to source buffer - jb mmov10 ; if b, destination may overlap - cmp r8, 8 ; check if 8 bytes to move - jb short mcpy40 ; if b, less than 8 bytes to move - -; -; Move alignment bytes. 
-; - - test cl, 7 ; test if destination aligned - jz short mcpy20 ; if z, destination aligned - test cl, 1 ; test if byte move needed - jz short mcpy00 ; if z, byte move not needed - mov al, [rcx + rdx] ; move byte - dec r8 ; decrement byte count - mov [rcx], al ; - inc rcx ; increment destination address -mcpy00: test cl, 2 ; test if word move needed - jz short mcpy10 ; if z, word move not needed - mov ax, [rcx + rdx] ; move word - sub r8, 2 ; reduce byte count - mov [rcx], ax ; - add rcx, 2 ; advance destination address -mcpy10: test cl, 4 ; test if dword move needed - jz short mcpy20 ; if z, dword move not needed - mov eax, [rcx + rdx] ; move dword - sub r8, 4 ; reduce byte count - mov [rcx], eax ; - add rcx, 4 ; advance destination address - -; -; Attempt to move 32-byte blocks. -; - -mcpy20: mov r9, r8 ; copy count of bytes remaining - shr r9, 5 ; compute number of 32-byte blocks - test r9, r9 ; v-liti, remove partial flag stall caused by shr - jnz short mcpy60 ; if nz, 32-byte blocks to fill - - align -; -; Move 8-byte blocks. -; - -mcpy25: mov r9, r8 ; copy count of bytes remaining - shr r9, 3 ; compute number of 8-byte blocks - test r9, r9 ; v-liti, remove partial flag stall caused by shr - jz short mcpy40 ; if z, no 8-byte blocks - align - -mcpy30: mov rax, [rcx + rdx] ; move 8-byte blocks - mov [rcx], rax ; - add rcx, 8 ; advance destination address - dec r9 ; decrement loop count - jnz short mcpy30 ; if nz, more 8-byte blocks - and r8, 7 ; compute remaining byte count - -; -; Test for residual bytes. -; - -mcpy40: test r8, r8 ; test if any bytes to move - jnz short mcpy50 ; if nz, residual bytes to move - mov rax, r11 ; set destination address - ret ; - -; -; Move residual bytes. 
-; - - align - -mcpy50: mov al, [rcx + rdx] ; move byte - mov [rcx], al ; - inc rcx ; increment destiantion address - dec r8 ; decrement loop count - jnz short mcpy50 ; if nz, more bytes to fill - mov rax, r11 ; set destination address - ret ; return - -; -; Move 32 byte blocks -; - - align 16 - - db 066h, 066h, 066h, 090h - db 066h, 066h, 090h + mov rax, rcx ; save dst address + cmp r8, 16 + jbe mcpy02 + + cmp r8, 64 + jnbe mcpy07 + + ; count > 16 && count <= 64 + align 16 +mcpy00: movdqu xmm0, [rdx] + movdqu xmm1, -16[rdx + r8] ; save 16 to 32 bytes src + cmp r8, 32 + jbe mcpy01 + + movdqu xmm2, 16[rdx] + movdqu xmm3, -32[rdx + r8] ; save 32 to 64 bytes src + + ;count > 32 && count <= 64 + movdqu 16[rcx], xmm2 + movdqu -32[rcx + r8], xmm3 + + ;count > 16 && count <= 32 +mcpy01: movdqu [rcx], xmm0 + movdqu -16[rcx + r8], xmm1 + ret + + ; count <= 16 + align 16 +mcpy02: test r8b, 24 ; test count with 0001 1000 + jz mcpy03 + ; count >= 8 && count <= 16 + mov r9, [rdx] + mov r10, -8[rdx + r8] + mov [rcx], r9 + mov -8[rcx + r8], r10 + ret + + align 16 +mcpy03: test r8b, 4 ; test count with 0100 + jz mcpy04 + ; count >= 4 && count < 8 + mov r9d, [rdx] + mov r10d, -4[rdx + r8] + mov [rcx], r9d + mov -4[rcx + r8], r10d + ret + + ; count >= 0 && count < 4 + align 16 +mcpy04: test r8, r8 + jz mcpy06 ; count == 1/2/3 + mov r9b, [rdx] ; save the first byte + + test r8b, 2 ; test count with 0010 + jz mcpy05 + mov r10w, -2[rdx + r8] + mov -2[rcx + r8], r10w +mcpy05: mov [rcx], r9b +mcpy06: ret + + align 16 + ; count > 64, we need to check overlap +mcpy07: mov r9, rdx ; r9 is src address + sub r9, rcx ; if src - dst < 0 jump to mcpy11 + jb mcpy11 ; if b, destination may overlap + +mcpy08: cmp r8, 512 + jnbe mcpy10 + + ; count > 64 && count <= 512 + mov r9, r8 + shr r9, 6 ; count/64 + + align 16 +mcpy09: movdqu xmm0, [rdx] + movdqu xmm1, 16[rdx] + movdqu xmm2, 32[rdx] + movdqu xmm3, 48[rdx] + movdqu [rcx], xmm0 + movdqu 16[rcx], xmm1 + movdqu 32[rcx], xmm2 + movdqu 48[rcx], xmm3 
+ add rdx, 64 + add rcx, 64 + dec r9 + jnz mcpy09 + + ; the remainder is from 0 to 63 + and r8, 3fh ; and with 0011 1111 + cmp r8, 16 + jnbe mcpy00 + + ; the remainder <= 16 + jmp mcpy02 + ret + + ; count > 512 + align 16 +mcpy10: mov r10, rdi ; save rdi + mov r11, rsi ; save rsi + mov rdi, rcx ; rdi is dst + mov rsi, rdx ; rsi is src + mov rcx, r8 ; rcx is count + rep movsb ; mov from rsi to rdi + mov rsi, r11 ; restore rsi + mov rdi, r10 ; restore rdi + ret -mcpy60: cmp r9, CACHE_LIMIT_MEMMOV / 32 ; check if large move - jae short mcpy80 ; if ae, large move -mcpy70: mov rax, [rcx + rdx] ; move 32-byte block - mov r10, 8[rcx + rdx] ; - add rcx, 32 ; advance destination address - mov (-32)[rcx], rax ; - mov (-24)[rcx], r10 ; - mov rax, (-16)[rcx + rdx] ; - mov r10, (-8)[rcx + rdx] ; - dec r9 ; - mov (-16)[rcx], rax ; - mov (-8)[rcx], r10 ; - jnz short mcpy70 ; if nz, more 32-byte blocks - and r8, 31 ; compute remaining byte count - jmp mcpy25 ; - -; -; Move 64-byte blocks nontemporal. -; - - align - - db 066h, 090h - -mcpy80: cmp rdx, CACHE_BLOCK ; check if cache block spacing - jb short mcpy70 ; if b, not cache block spaced -mcpy81: mov eax, CACHE_BLOCK / 128 ; set loop count -mcpy85: prefetchnta [rcx + rdx] ; prefetch 128 bytes - prefetchnta 64[rcx + rdx] ; - add rcx, 128 ; advance source address - dec eax ; decrement loop count - jnz short mcpy85 ; if nz, more to prefetch - sub rcx, CACHE_BLOCK ; reset source address - mov eax, CACHE_BLOCK / 64 ; set loop count -mcpy90: mov r9, [rcx + rdx] ; move 64-byte block - mov r10, 8[rcx + rdx] ; - movnti [rcx], r9 ; - movnti 8[rcx], r10 ; - mov r9, 16[rcx + rdx] ; - mov r10, 24[rcx + rdx] ; - movnti 16[rcx], r9 ; - movnti 24[rcx], r10 ; - mov r9, 32[rcx + rdx] ; - mov r10, 40[rcx + rdx] ; - add rcx, 64 ; advance destination address - movnti (32 - 64)[rcx], r9 ; - movnti (40 - 64)[rcx], r10 ; - mov r9, (48 - 64)[rcx + rdx] ; - mov r10, (56 - 64)[rcx + rdx] ; - dec eax ; - movnti (48 - 64)[rcx], r9 ; - movnti (56 - 
64)[rcx], r10 ; - jnz short mcpy90 ; if nz, more 32-byte blocks - sub r8, CACHE_BLOCK ; reduce remaining length - cmp r8, CACHE_BLOCK ; check if cache block remains - jae mcpy81 ; if ae, cache block remains - lock or byte ptr [rsp], 0 ; flush data to memory - jmp mcpy20 ; - -; ; The source address is less than the destination address. -; - - align - - db 066h, 066h, 066h, 090h - db 066h, 066h, 066h, 090h - db 066h, 090h - -mmov10: add rcx, r8 ; compute ending destination address - cmp r8, 8 ; check if 8 bytes to move - jb short mmov60 ; if b, less than 8 bytes to move - -; -; Move alignment bytes. -; - - test cl, 7 ; test if destination aligned - jz short mmov30 ; if z, destination aligned - test cl, 1 ; test if byte move needed - jz short mmov15 ; if z, byte move not needed - dec rcx ; decrement destination address - mov al, [rcx + rdx] ; move byte - dec r8 ; decrement byte count - mov [rcx], al ; -mmov15: test cl, 2 ; test if word move needed - jz short mmov20 ; if z, word move not needed - sub rcx, 2 ; reduce destination address - mov ax, [rcx + rdx] ; move word - sub r8, 2 ; reduce byte count - mov [rcx], ax ; -mmov20: test cl, 4 ; test if dword move needed - jz short mmov30 ; if z, dword move not needed - sub rcx, 4 ; reduce destination address - mov eax, [rcx + rdx] ; move dword - sub r8, 4 ; reduce byte count - mov [rcx], eax ; -; -; Attempt to move 32-byte blocks -; - -mmov30: mov r9, r8 ; copy count of bytes remaining - shr r9, 5 ; compute number of 32-byte blocks - test r9, r9 ; v-liti, remove partial flag stall caused by shr - jnz short mmov80 ; if nz, 32-byte blocks to fill - -; -; Move 8-byte blocks. 
-; - align - -mmov40: mov r9, r8 ; copy count of bytes remaining - shr r9, 3 ; compute number of 8-byte blocks - test r9, r9 ; v-liti, remove partial flag stall caused by shr - jz short mmov60 ; if z, no 8-byte blocks - - align - -mmov50: sub rcx, 8 ; reduce destination address - mov rax, [rcx + rdx] ; move 8-byte blocks - dec r9 ; decrement loop count - mov [rcx], rax ; - jnz short mmov50 ; if nz, more 8-byte blocks - and r8, 7 ; compute remaining byte count - -; -; Test for residual bytes. -; - -mmov60: test r8, r8 ; test if any bytes to move - jnz short mmov70 ; if nz, residual bytes to move - mov rax, r11 ; set destination address - ret ; - -; -; Move residual bytes. -; - - align - -mmov70: dec rcx ; decrement destination address - mov al, [rcx + rdx] ; move byte - dec r8 ; decrement loop count - mov [rcx], al ; - jnz short mmov70 ; if nz, more bytes to fill - mov rax, r11 ; set destination address - ret ; return - -; -; Move 32 byte blocks -; - - align 16 - - db 066h, 066h, 066h, 090h - db 066h, 066h, 090h - -mmov80: cmp r9, CACHE_LIMIT_MEMMOV / 32 ; check if large move - jae short mmov93 ; if ae, large move -mmov90: mov rax, (-8)[rcx + rdx] ; move 32-byte block - mov r10, (-16)[rcx + rdx] ; - sub rcx, 32 ; reduce destination address - mov 24[rcx], rax ; - mov 16[rcx], r10 ; - mov rax, 8[rcx + rdx] ; - mov r10, [rcx + rdx] ; - dec r9 ; - mov 8[rcx], rax ; - mov [rcx], r10 ; - jnz short mmov90 ; if nz, more 32-byte blocks - and r8, 31 ; compute remaining byte count - jmp mmov40 ; - -; -; Move 64-byte blocks nontemporal. 
-; - - align - - db 066h, 090h - -mmov93: cmp rdx, -CACHE_BLOCK ; check if cache block spacing - ja short mmov90 ; if a, not cache block spaced -mmov94: mov eax, CACHE_BLOCK / 128 ; set loop count -mmov95: sub rcx, 128 ; reduce destination address - prefetchnta [rcx + rdx] ; prefetch 128 bytes - prefetchnta 64[rcx + rdx] ; - dec eax ; decrement loop count - jnz short mmov95 ; if nz, more to prefetch - add rcx, CACHE_BLOCK ; reset source address - mov eax, CACHE_BLOCK / 64 ; set loop count -mmov97: mov r9, (-8)[rcx + rdx] ; move 64-byte block - mov r10, (-16)[rcx + rdx] ; - movnti (-8)[rcx], r9 ; - movnti (-16)[rcx], r10 ; - mov r9, (-24)[rcx + rdx] ; - mov r10, (-32)[rcx + rdx] ; - movnti (-24)[rcx], r9 ; - movnti (-32)[rcx], r10 ; - mov r9, (-40)[rcx + rdx] ; - mov r10, (-48)[rcx + rdx] ; - sub rcx, 64 ; reduce destination address - movnti (64 - 40)[rcx], r9 ; - movnti (64 - 48)[rcx], r10 ; - mov r9, (64 - 56)[rcx + rdx] ; - mov r10, (64 - 64)[rcx + rdx] ; - dec eax ; decrement loop count - movnti (64 - 56)[rcx], r9 ; - movnti (64 - 64)[rcx], r10 ; - jnz short mmov97 ; if nz, more 32-byte blocks - sub r8, CACHE_BLOCK ; reduce remaining length - cmp r8, CACHE_BLOCK ; check if cache block remains - jae mmov94 ; if ae, cache block remains - lock or byte ptr [rsp], 0 ; flush data to memory - jmp mmov30 ; + align 16 +mcpy11: add r9, r8 ; src - dst + count + cmp r9, 0 ; src + count < = dst jump to mcpy08 + jle mcpy08 + + lea r9, [rdx + r8] ; r9 is the src + count + lea r10, [rcx + r8] ; r10 is the dst + count + + mov r11, r8 + shr r11, 6 ; count/64 + + ; count > 64 + align 16 +mcpy12: movdqu xmm0, -16[r9] + movdqu xmm1, -32[r9] + movdqu xmm2, -48[r9] + movdqu xmm3, -64[r9] + movdqu -16[r10], xmm0 + movdqu -32[r10], xmm1 + movdqu -48[r10], xmm2 + movdqu -64[r10], xmm3 + sub r9, 64 + sub r10, 64 + dec r11 + jnz mcpy12 + + ; the remainder is from 0 to 63 + and r8, 3fh ; and with 0011 1111 + cmp r8, 16 + jnbe mcpy00 + + ; the remainder <= 16 + jmp mcpy02 LEAF_END_MARKED 
JIT_MemCpy, _TEXT - - - end - + end \ No newline at end of file diff --git a/src/vm/comutilnative.cpp b/src/vm/comutilnative.cpp index 0f27542e1d3d..d98acd5dcdce 100644 --- a/src/vm/comutilnative.cpp +++ b/src/vm/comutilnative.cpp @@ -1478,7 +1478,11 @@ FCIMPL5(VOID, Buffer::BlockCopy, ArrayBase *src, int srcOffset, ArrayBase *dst, PTR_BYTE dstPtr = dst->GetDataPtr() + dstOffset; if ((srcPtr != dstPtr) && (count > 0)) { +#if defined(_WIN64) + JIT_MemCpy(dstPtr, srcPtr, count); +#else memmove(dstPtr, srcPtr, count); +#endif } FC_GC_POLL(); @@ -1524,7 +1528,11 @@ FCIMPL5(VOID, Buffer::InternalBlockCopy, ArrayBase *src, int srcOffset, ArrayBas _ASSERTE(count >= 0); // Copy the data. +#if defined(_WIN64) + JIT_MemCpy(dst->GetDataPtr() + dstOffset, src->GetDataPtr() + srcOffset, count); +#else memmove(dst->GetDataPtr() + dstOffset, src->GetDataPtr() + srcOffset, count); +#endif FC_GC_POLL(); } From 73bcd5a2e14d362e54096b1fa4558223566e474e Mon Sep 17 00:00:00 2001 From: helloguo Date: Thu, 15 Sep 2016 13:02:27 -0700 Subject: [PATCH 2/2] add header comments --- src/vm/amd64/CrtHelpers.asm | 58 +++++++++++++++++++++++++++++++++++-- src/vm/comutilnative.cpp | 10 +++---- 2 files changed, 61 insertions(+), 7 deletions(-) diff --git a/src/vm/amd64/CrtHelpers.asm b/src/vm/amd64/CrtHelpers.asm index 9e883353dd7f..9d5b280558bc 100644 --- a/src/vm/amd64/CrtHelpers.asm +++ b/src/vm/amd64/CrtHelpers.asm @@ -15,6 +15,31 @@ include AsmMacros.inc ;char *memset(dst, value, count) - sets "count" bytes at "dst" to "value" +; +;Purpose: +; Sets the first "count" bytes of the memory starting +; at "dst" to the character value "value". 
+; +;Algorithm: +;Set dst based on count as follows +; count [0, 16]: use 1/2/4/8-byte-wide registers +; count (16, 128]: use 16-byte-wide registers (XMM) without loop +; count (128, 512]: use 16-byte-wide registers (XMM) with loops, unrolled 8 times +; count (512, upper): use rep stosb +;Entry: +; char *dst - pointer to memory to fill with value +; char value - value to put in dst bytes +; int count - number of bytes of dst to fill +; +;Exit: +; returns dst, with filled bytes +; +;Uses: +; +;Exceptions: +; +;******************************************************************************* + LEAF_ENTRY JIT_MemSet, _TEXT movzx edx, dl ; set fill pattern @@ -123,9 +148,38 @@ mset08: ret LEAF_END_MARKED JIT_MemSet, _TEXT +;JIT_MemCpy - Copy source buffer to destination buffer +; +;Purpose: +; JIT_MemCpy() copies a source memory buffer to a destination memory +; buffer. This routine recognizes overlapping buffers to avoid propagation. +; For cases where propagation is not a problem, memcpy() can be used. 
+; +;Algorithm: +;Copy to destination based on count as follows +; count [0, 64]: overlap check not needed (all loads complete before any store) +; count [0, 16]: use 1/2/4/8-byte-wide registers +; count (16, 64]: use 16-byte-wide registers (XMM) without loop +; count (64, upper): check overlap +; forward copy (src >= dst, or src + count <= dst): +; count (64, 512]: use 16-byte-wide registers (XMM) with loops, unrolled 4 times +; count (512, upper): use rep movsb +; backward copy (src < dst < src + count): +; use 16-byte-wide registers (XMM) with loops to copy from end to beginning +; +;Entry: +; void *dst = pointer to destination buffer +; const void *src = pointer to source buffer +; size_t count = number of bytes to copy +; +;Exit: +; Returns a pointer to the destination buffer +; +;Uses: +; +;Exceptions: +;******************************************************************************* -; from 0 to 64 bytes, no need to check overlap -; because we use xmm0 to xmm3 to store the src buffer LEAF_ENTRY JIT_MemCpy, _TEXT mov rax, rcx ; save dst address diff --git a/src/vm/comutilnative.cpp b/src/vm/comutilnative.cpp index d98acd5dcdce..41655cb5b01b 100644 --- a/src/vm/comutilnative.cpp +++ b/src/vm/comutilnative.cpp @@ -1478,11 +1478,11 @@ FCIMPL5(VOID, Buffer::BlockCopy, ArrayBase *src, int srcOffset, ArrayBase *dst, PTR_BYTE dstPtr = dst->GetDataPtr() + dstOffset; if ((srcPtr != dstPtr) && (count > 0)) { -#if defined(_WIN64) +#if defined(_AMD64_) && !defined(PLATFORM_UNIX) JIT_MemCpy(dstPtr, srcPtr, count); -#else +#else memmove(dstPtr, srcPtr, count); -#endif +#endif } FC_GC_POLL(); @@ -1528,9 +1528,9 @@ FCIMPL5(VOID, Buffer::InternalBlockCopy, ArrayBase *src, int srcOffset, ArrayBas _ASSERTE(count >= 0); // Copy the data. -#if defined(_WIN64) +#if defined(_AMD64_) && !defined(PLATFORM_UNIX) JIT_MemCpy(dst->GetDataPtr() + dstOffset, src->GetDataPtr() + srcOffset, count); -#else +#else memmove(dst->GetDataPtr() + dstOffset, src->GetDataPtr() + srcOffset, count); #endif