Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Fixing Buffer::BlockCopy, JIT_MemCpy, and JIT_MemSet to just call the appropriate CRT functions for x64 Windows, as is already done for all other platforms/targets #25750

Merged
merged 3 commits into from
Jul 18, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
334 changes: 47 additions & 287 deletions src/vm/amd64/CrtHelpers.asm
Original file line number Diff line number Diff line change
Expand Up @@ -14,317 +14,77 @@

include AsmMacros.inc

;char *memset(dst, value, count) - sets "count" bytes at "dst" to "value"
extern memset:proc
extern memmove:proc

; JIT_MemSet/JIT_MemCpy
;
; It is IMPORTANT that the exception handling code is able to find these guys
; on the stack, but on windows platforms we can just defer to the platform
; implementation.
;

; void JIT_MemSet(void* dest, int c, size_t count)
;
; Purpose:
;    Sets the first "count" bytes of the block of memory pointed to by
;    "dest" to the specified value (interpreted as an unsigned char).
;    The fill itself is done by the CRT memset, which is reached with a
;    tail jump so the incoming argument registers are passed through as-is.
;
; Entry:
;    RCX: void* dest    - Pointer to the block of memory to fill.
;    RDX: int c         - Value to be set.
;    R8:  size_t count  - Number of bytes to be set to the value.
;
; Exit:
;    None. The helper is declared as returning void (jitinterface.h), so
;    RAX is not meaningful to callers. On the count == 0 path RAX is left
;    untouched.
;
; Uses:
;    Flags only (test/cmp). No register is modified before the jump to
;    memset.
;
; Exceptions:
;    When count != 0, one byte at [dest] is read before forwarding, so an
;    invalid (e.g. null) dest faults inside this marked helper.
;    NOTE(review): presumably this is what lets the exception handling code
;    attribute the fault to JIT_MemSet rather than to the CRT - confirm
;    against the exception handling code.
;
;*******************************************************************************

LEAF_ENTRY JIT_MemSet, _TEXT
        test    r8, r8                  ; check if count is zero
        jz      Exit_MemSet             ; if zero, no bytes to set (dest is never touched)

        cmp     byte ptr [rcx], 0       ; check dest for null (result unused; read is the point)

        jmp     memset                  ; forward to the CRT implementation (tail call, args unchanged)

Exit_MemSet:
        ret

LEAF_END_MARKED JIT_MemSet, _TEXT

; void JIT_MemCpy(void* dest, const void* src, size_t count)
;
; Purpose:
;    Copies the values of "count" bytes from the location pointed to
;    by "src" to the memory block pointed to by "dest".
;    The copy itself is done by the CRT memmove, which is reached with a
;    tail jump so the incoming argument registers are passed through as-is.
;
; Entry:
;    RCX: void* dest         - Pointer to the destination array where content is to be copied.
;    RDX: const void* src    - Pointer to the source of the data to be copied.
;    R8:  size_t count       - Number of bytes to copy.
;
; Exit:
;    None. The declarations of these methods in jitinterface.h have them as
;    void returning, rather than as void* returning (which is what the CRT
;    implementations do):
;
;        void STDCALL JIT_MemSet(void *dest, int c, SIZE_T count);
;        void STDCALL JIT_MemCpy(void *dest, const void *src, SIZE_T count);
;
;    Callers must therefore not rely on RAX. On the count == 0 path RAX is
;    left untouched.
;
; Uses:
;    Flags only (test/cmp). No register is modified before the jump to
;    memmove.
;
; Exceptions:
;    When count != 0, one byte at [dest] and one byte at [src] are read
;    before forwarding, so an invalid (e.g. null) pointer faults inside this
;    marked helper.
;    NOTE(review): presumably this is what lets the exception handling code
;    attribute the fault to JIT_MemCpy rather than to the CRT - confirm
;    against the exception handling code.
;
;*******************************************************************************

LEAF_ENTRY JIT_MemCpy, _TEXT
        test    r8, r8                  ; check if count is zero
        jz      Exit_MemCpy             ; if zero, no bytes to copy (neither pointer is touched)

        cmp     byte ptr [rcx], 0       ; check dest for null (result unused; read is the point)
        cmp     byte ptr [rdx], 0       ; check src for null  (result unused; read is the point)

        ; Use memmove to handle overlapping buffers for better
        ; compatibility with .NET Framework. Needing to handle
        ; overlapping buffers in cpblk is undefined by the spec.
        jmp     memmove                 ; forward to the CRT implementation (tail call, args unchanged)

Exit_MemCpy:
        ret

LEAF_END_MARKED JIT_MemCpy, _TEXT
end
end
Loading