Skip to content

Commit af344b6

Browse files
Fixing Buffer::BlockCopy, JIT_MemCpy, and JIT_MemSet to just call the appropriate CRT functions for x64 Windows, as is already done for all other platforms/targets (dotnet/coreclr#25750)
* Fixing Buffer::BlockCopy to just call the CRT memmove for x64 Windows, as is already done for all other platforms/targets * Fixing up the x64 CrtHelpers.asm to just forward to the CRT implementations for JIT_MemSet and JIT_MemCpy * Keep unix using memcpy and clarify that Windows uses memmove for full framework compat. Commit migrated from dotnet/coreclr@54d9d9b
1 parent 9beaef9 commit af344b6

File tree

3 files changed

+90
-300
lines changed

3 files changed

+90
-300
lines changed

src/coreclr/src/vm/amd64/CrtHelpers.asm

+47-287
Original file line numberDiff line numberDiff line change
@@ -14,317 +14,77 @@
1414

1515
include AsmMacros.inc
1616

17-
;char *memset(dst, value, count) - sets "count" bytes at "dst" to "value"
17+
extern memset:proc
18+
extern memmove:proc
19+
20+
; JIT_MemSet/JIT_MemCpy
1821
;
19-
;Purpose:
20-
; Sets the first "count" bytes of the memory starting
21-
; at "dst" to the character value "value".
22+
; It is IMPORTANT that the exception handling code is able to find these helpers
23+
; on the stack, but on Windows platforms we can just defer to the platform
24+
; implementation.
2225
;
23-
;Algorithm:
24-
;Set dst based on count as follow
25-
; count [0, 16]: use 1/2/4/8 bytes width registers
26-
; count [16, 128]: use 16 bytes width registers (XMM) without loop
27-
; count [128, 512]: use 16 bytes width registers (XMM) with loops, unrolled 8 times
28-
; count [512, upper]: use rep stosb
29-
;Entry:
30-
; char *dst - pointer to memory to fill with value
31-
; char value - value to put in dst bytes
32-
; int count - number of bytes of dst to fill
26+
27+
; void JIT_MemSet(void* dest, int c, size_t count)
3328
;
34-
;Exit:
35-
; returns dst, with filled bytes
29+
; Purpose:
30+
; Sets the first "count" bytes of the block of memory pointed to by
31+
; "dest" to the specified value (interpreted as an unsigned char).
3632
;
37-
;Uses:
33+
; Entry:
34+
; RCX: void* dest - Pointer to the block of memory to fill.
35+
; RDX: int c - Value to be set.
36+
; R8: size_t count - Number of bytes to be set to the value.
3837
;
39-
;Exceptions:
38+
; Exit:
39+
;
40+
; Uses:
41+
;
42+
; Exceptions:
4043
;
41-
;*******************************************************************************
42-
4344
LEAF_ENTRY JIT_MemSet, _TEXT
45+
test r8, r8 ; check if count is zero
46+
jz Exit_MemSet ; if zero, no bytes to set
4447

45-
movzx edx, dl ; set fill pattern
46-
mov r9, 0101010101010101h
47-
imul rdx, r9 ; rdx is 8 bytes filler
48+
cmp byte ptr [rcx], 0 ; check dest for null
4849

49-
cmp r8, 16
50-
jbe mset04
50+
jmp memset ; forward to the CRT implementation
5151

52-
cmp r8, 512
53-
jbe mset00
54-
55-
; count > 512
56-
mov r10, rcx ; save dst address
57-
mov r11, rdi ; save rdi
58-
mov eax, edx ; eax is value
59-
mov rdi, rcx ; rdi is dst
60-
mov rcx, r8 ; rcx is count
61-
rep stosb
62-
mov rdi, r11 ; restore rdi
63-
mov rax, r10
52+
Exit_MemSet:
6453
ret
6554

66-
align 16
67-
mset00: mov rax, rcx ; save dst address
68-
movd xmm0, rdx
69-
punpcklbw xmm0, xmm0 ; xmm0 is 16 bytes filler
70-
71-
cmp r8, 128
72-
jbe mset02
73-
74-
; count > 128 && count <= 512
75-
mov r9, r8
76-
shr r9, 7 ; count/128
77-
78-
align 16
79-
mset01: movdqu [rcx], xmm0
80-
movdqu 16[rcx], xmm0
81-
movdqu 32[rcx], xmm0
82-
movdqu 48[rcx], xmm0
83-
movdqu 64[rcx], xmm0
84-
movdqu 80[rcx], xmm0
85-
movdqu 96[rcx], xmm0
86-
movdqu 112[rcx], xmm0
87-
add rcx, 128
88-
dec r9
89-
jnz mset01
90-
and r8, 7fh ; and r8 with 0111 1111
91-
92-
; the remainder is from 0 to 127
93-
cmp r8, 16
94-
jnbe mset02
95-
96-
; the remainder <= 16
97-
movdqu -16[rcx + r8], xmm0
98-
ret
99-
100-
; count > 16 && count <= 128 for mset02
101-
align 16
102-
mset02: movdqu [rcx], xmm0
103-
movdqu -16[rcx + r8], xmm0
104-
cmp r8, 32
105-
jbe mset03
106-
107-
; count > 32 && count <= 64
108-
movdqu 16[rcx], xmm0
109-
movdqu -32[rcx + r8], xmm0
110-
cmp r8, 64
111-
jbe mset03
112-
113-
; count > 64 && count <= 128
114-
movdqu 32[rcx], xmm0
115-
movdqu 48[rcx], xmm0
116-
movdqu -48[rcx + r8], xmm0
117-
movdqu -64[rcx + r8], xmm0
118-
mset03: ret
119-
120-
align 16
121-
mset04: mov rax, rcx ; save dst address
122-
test r8b, 24 ; and r8b with 0001 1000
123-
jz mset05
124-
125-
; count >= 8 && count <= 16
126-
mov [rcx], rdx
127-
mov -8[rcx + r8], rdx
128-
ret
129-
130-
align 16
131-
mset05: test r8b, 4 ; and r8b with 0100
132-
jz mset06
133-
134-
; count >= 4 && count < 8
135-
mov [rcx], edx
136-
mov -4[rcx + r8], edx
137-
ret
138-
139-
; count >= 0 && count < 4
140-
align 16
141-
mset06: test r8b, 1 ; and r8b with 0001
142-
jz mset07
143-
mov [rcx],dl
144-
mset07: test r8b, 2 ; and r8b with 0010
145-
jz mset08
146-
mov -2[rcx + r8], dx
147-
mset08: ret
148-
14955
LEAF_END_MARKED JIT_MemSet, _TEXT
15056

151-
;JIT_MemCpy - Copy source buffer to destination buffer
57+
; void JIT_MemCpy(void* dest, const void* src, size_t count)
15258
;
153-
;Purpose:
154-
; JIT_MemCpy() copies a source memory buffer to a destination memory
155-
; buffer. This routine recognize overlapping buffers to avoid propogation.
156-
; For cases where propogation is not a problem, memcpy() can be used.
59+
; Purpose:
60+
; Copies the values of "count" bytes from the location pointed to
61+
; by "src" to the memory block pointed to by "dest".
15762
;
158-
;Algorithm:
159-
;Copy to destination based on count as follow
160-
; count [0, 64]: overlap check not needed
161-
; count [0, 16]: use 1/2/4/8 bytes width registers
162-
; count [16, 64]: use 16 bytes width registers (XMM) without loop
163-
; count [64, upper]: check overlap
164-
; non-overlap:
165-
; count [64, 512]: use 16 bytes width registers (XMM) with loops, unrolled 4 times
166-
; count [512, upper]: use rep movsb
167-
; overlap::
168-
; use 16 bytes width registers (XMM) with loops to copy from end to beginnig
63+
; Entry:
64+
; RCX: void* dest - Pointer to the destination array where content is to be copied.
65+
; RDX: const void* src - Pointer to the source of the data to be copied.
66+
; R8: size_t count - Number of bytes to copy.
16967
;
170-
;Entry:
171-
; void *dst = pointer to destination buffer
172-
; const void *src = pointer to source buffer
173-
; size_t count = number of bytes to copy
68+
; Exit:
17469
;
175-
;Exit:
176-
; Returns a pointer to the destination buffer
70+
; Uses:
17771
;
178-
;Uses:
72+
; Exceptions:
17973
;
180-
;Exceptions:
181-
;*******************************************************************************
182-
18374
LEAF_ENTRY JIT_MemCpy, _TEXT
75+
test r8, r8 ; check if count is zero
76+
jz Exit_MemCpy ; if zero, no bytes to copy
18477

185-
mov rax, rcx ; save dst address
186-
cmp r8, 16
187-
jbe mcpy02
188-
189-
cmp r8, 64
190-
jnbe mcpy07
78+
cmp byte ptr [rcx], 0 ; check dest for null
79+
cmp byte ptr [rdx], 0 ; check src for null
19180

192-
; count > 16 && count <= 64
193-
align 16
194-
mcpy00: movdqu xmm0, [rdx]
195-
movdqu xmm1, -16[rdx + r8] ; save 16 to 32 bytes src
196-
cmp r8, 32
197-
jbe mcpy01
198-
199-
movdqu xmm2, 16[rdx]
200-
movdqu xmm3, -32[rdx + r8] ; save 32 to 64 bytes src
201-
202-
;count > 32 && count <= 64
203-
movdqu 16[rcx], xmm2
204-
movdqu -32[rcx + r8], xmm3
205-
206-
;count > 16 && count <= 32
207-
mcpy01: movdqu [rcx], xmm0
208-
movdqu -16[rcx + r8], xmm1
209-
ret
210-
211-
; count <= 16
212-
align 16
213-
mcpy02: test r8b, 24 ; test count with 0001 1000
214-
jz mcpy03
215-
; count >= 8 && count <= 16
216-
mov r9, [rdx]
217-
mov r10, -8[rdx + r8]
218-
mov [rcx], r9
219-
mov -8[rcx + r8], r10
220-
ret
221-
222-
align 16
223-
mcpy03: test r8b, 4 ; test count with 0100
224-
jz mcpy04
225-
; count >= 4 && count < 8
226-
mov r9d, [rdx]
227-
mov r10d, -4[rdx + r8]
228-
mov [rcx], r9d
229-
mov -4[rcx + r8], r10d
230-
ret
231-
232-
; count >= 0 && count < 4
233-
align 16
234-
mcpy04: test r8, r8
235-
jz mcpy06 ; count == 1/2/3
236-
mov r9b, [rdx] ; save the first byte
237-
238-
test r8b, 2 ; test count with 0010
239-
jz mcpy05
240-
mov r10w, -2[rdx + r8]
241-
mov -2[rcx + r8], r10w
242-
mcpy05: mov [rcx], r9b
243-
mcpy06: ret
244-
245-
align 16
246-
; count > 64, we need to check overlap
247-
mcpy07: mov r9, rdx ; r9 is src address
248-
sub r9, rcx ; if src - dst < 0 jump to mcpy11
249-
jb mcpy11 ; if b, destination may overlap
250-
251-
mcpy08: cmp r8, 512
252-
jnbe mcpy10
253-
254-
; count > 64 && count <= 512
255-
mov r9, r8
256-
shr r9, 6 ; count/64
257-
258-
align 16
259-
mcpy09: movdqu xmm0, [rdx]
260-
movdqu xmm1, 16[rdx]
261-
movdqu xmm2, 32[rdx]
262-
movdqu xmm3, 48[rdx]
263-
movdqu [rcx], xmm0
264-
movdqu 16[rcx], xmm1
265-
movdqu 32[rcx], xmm2
266-
movdqu 48[rcx], xmm3
267-
add rdx, 64
268-
add rcx, 64
269-
dec r9
270-
jnz mcpy09
271-
272-
; the remainder is from 0 to 63
273-
and r8, 3fh ; and with 0011 1111
274-
cmp r8, 16
275-
jnbe mcpy00
81+
; Use memmove to handle overlapping buffers for better
82+
; compatibility with .NET Framework. The behavior of cpblk
83+
; on overlapping buffers is undefined by the spec.
84+
jmp memmove ; forward to the CRT implementation
27685

277-
; the remainder <= 16
278-
jmp mcpy02
279-
ret
280-
281-
; count > 512
282-
align 16
283-
mcpy10: mov r10, rdi ; save rdi
284-
mov r11, rsi ; save rsi
285-
mov rdi, rcx ; rdi is dst
286-
mov rsi, rdx ; rsi is src
287-
mov rcx, r8 ; rcx is count
288-
rep movsb ; mov from rsi to rdi
289-
mov rsi, r11 ; restore rsi
290-
mov rdi, r10 ; restore rdi
86+
Exit_MemCpy:
29187
ret
29288

293-
; The source address is less than the destination address.
294-
295-
align 16
296-
mcpy11: add r9, r8 ; src - dst + count
297-
cmp r9, 0 ; src + count < = dst jump to mcpy08
298-
jle mcpy08
299-
300-
lea r9, [rdx + r8] ; r9 is the src + count
301-
lea r10, [rcx + r8] ; r10 is the dst + count
302-
303-
mov r11, r8
304-
shr r11, 6 ; count/64
305-
306-
; count > 64
307-
align 16
308-
mcpy12: movdqu xmm0, -16[r9]
309-
movdqu xmm1, -32[r9]
310-
movdqu xmm2, -48[r9]
311-
movdqu xmm3, -64[r9]
312-
movdqu -16[r10], xmm0
313-
movdqu -32[r10], xmm1
314-
movdqu -48[r10], xmm2
315-
movdqu -64[r10], xmm3
316-
sub r9, 64
317-
sub r10, 64
318-
dec r11
319-
jnz mcpy12
320-
321-
; the remainder is from 0 to 63
322-
and r8, 3fh ; and with 0011 1111
323-
cmp r8, 16
324-
jnbe mcpy00
325-
326-
; the remainder <= 16
327-
jmp mcpy02
328-
32989
LEAF_END_MARKED JIT_MemCpy, _TEXT
330-
end
90+
end

0 commit comments

Comments
 (0)