Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit bbbb5c1

Browse files
committed
[Arm64/Linux] Use platform memset/memcpy
Fixes buggy memset implementation. Use heavily optimized platform implementation. Follows amd64 & arm precedent.
1 parent c29b30b commit bbbb5c1

File tree

1 file changed

+15
-329
lines changed

1 file changed

+15
-329
lines changed

src/vm/arm64/crthelpers.S

+15-329
Original file line numberDiff line numberDiff line change
@@ -2,347 +2,33 @@
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

5-
// ==++==
6-
//
7-
85
#include "unixasmmacros.inc"
96

7+
// JIT_MemSet/JIT_MemCpy
108
//
11-
// ==--==
12-
13-
// Calls to JIT_MemSet are emitted by the JIT for initialization of large structs.
14-
// We need to provide our own implementation of memset instead of using the ones in crt because the crt implementation does not guarantee
15-
// that aligned 8/4/2 - byte memory will be written atomically. This is required because members in a struct can be read atomically
16-
// and their values should be written atomically.
17-
//
18-
//
19-
//void JIT_MemSet(void *dst, int val, SIZE_T count)
20-
//
21-
// uint64_t valEx = (unsigned char)val;
22-
// valEx = valEx | valEx << 8;
23-
// valEx = valEx | valEx << 16;
24-
// valEx = valEx | valEx << 32;
25-
//
26-
// size_t dc_zva_size = 4ULL << DCZID_EL0.BS;
27-
//
28-
// uint64_t use_dc_zva = (val == 0) && !DCZID_EL0.p ? count / (2 * dc_zva_size) : 0; // ~Minimum size (assumes worst case alignment)
29-
//
30-
// // If not aligned then make it 8-byte aligned
31-
// if(((uint64_t)dst&0xf) != 0)
32-
// {
33-
// // Calculate alignment we can do without exceeding count
34-
// // Use math to avoid introducing more unpredictable branches
35-
// // Due to inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0
36-
// // Note logic will fail if count >= (1 << 61). But this exceeds max physical memory for arm64
37-
// uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64))
38-
//
39-
// if(align&0x1)
40-
// {
41-
// *(uint8_t*)dst = (uint8_t)valEx;
42-
// dst = (uint8_t*)dst + 1;
43-
// count-=1;
44-
// }
45-
//
46-
// if(align&0x2)
47-
// {
48-
// *(uint16_t*)dst = (uint16_t)valEx;
48-
// dst = (uint16_t*)dst + 1;
50-
// count-=2;
51-
// }
52-
//
53-
// if(align&0x4)
54-
// {
55-
// *(uint32_t*)dst = (uint32_t)valEx;
54-
// dst = (uint32_t*)dst + 1;
57-
// count-=4;
58-
// }
59-
// }
60-
//
61-
// if(use_dc_zva)
62-
// {
63-
// // If not aligned then make it aligned to dc_zva_size
64-
// if(dst&0x8)
65-
// {
66-
// *(uint64_t*)dst = (uint64_t)valEx;
67-
// dst = (uint64_t*)dst + 1;
68-
// count-=8;
69-
// }
9+
// It is IMPORTANT that the exception handling code is able to find these guys
10+
// on the stack, but on non-windows platforms we can just defer to the platform
11+
// implementation.
7012
//
71-
// while(dst & (dc_zva_size - 1))
72-
// {
73-
// *(uint64_t*)dst = valEx;
74-
// dst = (uint64_t*)dst + 1;
75-
// *(uint64_t*)dst = valEx;
76-
// dst = (uint64_t*)dst + 1;
77-
// count-=16;
78-
// }
79-
//
80-
// count -= dc_zva_size;
81-
//
82-
// while(count >= 0)
83-
// {
84-
// dc_zva(dst);
85-
// dst = (uint8_t*)dst + dc_zva_size;
86-
// count-=dc_zva_size;
87-
// }
88-
//
89-
// count += dc_zva_size;
90-
// }
91-
//
92-
// count-=16;
93-
//
94-
// while(count >= 0)
95-
// {
96-
// *(uint64_t*)dst = valEx;
97-
// dst = (uint64_t*)dst + 1;
98-
// *(uint64_t*)dst = valEx;
99-
// dst = (uint64_t*)dst + 1;
100-
// count-=16;
101-
// }
102-
//
103-
// if(count & 8)
104-
// {
105-
// *(uint64_t*)dst = valEx;
106-
// dst = (uint64_t*)dst + 1;
107-
// }
108-
//
109-
// if(count & 4)
110-
// {
111-
// *(uint32_t*)dst = (uint32_t)valEx;
112-
// dst = (uint32_t*)dst + 1;
113-
// }
114-
//
115-
// if(count & 2)
116-
// {
117-
// *(uint16_t*)dst = (uint16_t)valEx;
118-
// dst = (uint16_t*)dst + 1;
119-
// }
120-
//
121-
// if(count & 1)
122-
// {
123-
// *(uint8_t*)dst = (uint8_t)valEx;
124-
// }
125-
//
126-
//
127-
128-
// Assembly code corresponding to above C++ method. JIT_MemSet can AV and clr exception personality routine needs to
129-
// determine if the exception has taken place inside JIT_MemSet in order to throw corresponding managed exception.
130-
// Determining this is slow if the method were implemented as C++ method (using unwind info). In .asm file by adding JIT_MemSet_End
131-
// marker it can be easily determined if exception happened in JIT_MemSet. Therefore, JIT_MemSet has been written in assembly instead of
132-
// as C++ method.
133-
13413
LEAF_ENTRY JIT_MemSet, _TEXT
135-
ands w8, w1, #0xff
136-
mrs x3, DCZID_EL0 // x3 = DCZID_EL0
137-
mov x6, #4
138-
lsr x11, x2, #3 // x11 = count >> 3
139-
140-
orr w8, w8, w8, lsl #8
141-
and x5, x3, #0xf // x5 = dczid_el0.bs
142-
csel x11, x11, xzr, eq // x11 = (val == 0) ? count >> 3 : 0
143-
tst x3, (1 << 4)
144-
145-
orr w8, w8, w8, lsl #0x10
146-
csel x11, x11, xzr, eq // x11 = (val == 0) && !DCZID_EL0.p ? count >> 3 : 0
147-
ands x3, x0, #7 // x3 = dst & 7
148-
lsl x9, x6, x5 // x9 = size
149-
150-
orr x8, x8, x8, lsl #0x20
151-
lsr x11, x11, x5 // x11 = (val == 0) && !DCZID_EL0.p ? count >> (3 + DCZID_EL0.bs) : 0
152-
sub x10, x9, #1 // x10 = mask
153-
154-
b.eq LOCAL_LABEL(JIT_MemSet_0x80)
155-
156-
movn x4, #7
157-
clz x5, x2
158-
lsr x4, x4, x5
159-
and x3, x3, x4
14+
cbz x2, LOCAL_LABEL(JIT_MemSet_ret)
16015

161-
tbz x3, #0, LOCAL_LABEL(JIT_MemSet_0x2c)
162-
strb w8, [x0], #1
163-
sub x2, x2, #1
164-
LOCAL_LABEL(JIT_MemSet_0x2c):
165-
tbz x3, #1, LOCAL_LABEL(JIT_MemSet_0x5c)
166-
strh w8, [x0], #2
167-
sub x2, x2, #2
168-
LOCAL_LABEL(JIT_MemSet_0x5c):
169-
tbz x3, #2, LOCAL_LABEL(JIT_MemSet_0x80)
170-
str w8, [x0], #4
171-
sub x2, x2, #4
172-
LOCAL_LABEL(JIT_MemSet_0x80):
173-
cbz x11, LOCAL_LABEL(JIT_MemSet_0x9c)
174-
tbz x0, #3, LOCAL_LABEL(JIT_MemSet_0x84)
175-
str x8, [x0], #8
176-
sub x2, x2, #8
16+
strb w1, [x0]
17717

178-
b LOCAL_LABEL(JIT_MemSet_0x85)
179-
LOCAL_LABEL(JIT_MemSet_0x84):
180-
stp x8, x8, [x0], #16
181-
sub x2, x2, #16
182-
LOCAL_LABEL(JIT_MemSet_0x85):
183-
tst x0, x10
184-
b.ne LOCAL_LABEL(JIT_MemSet_0x84)
18+
b C_PLTFUNC(memset)
18519

186-
b LOCAL_LABEL(JIT_MemSet_0x8a)
187-
LOCAL_LABEL(JIT_MemSet_0x88):
188-
dc zva, x0
189-
add x0, x0, x9
190-
LOCAL_LABEL(JIT_MemSet_0x8a):
191-
subs x2, x2, x9
192-
b.ge LOCAL_LABEL(JIT_MemSet_0x88)
193-
194-
LOCAL_LABEL(JIT_MemSet_0x8c):
195-
add x2, x2, x9
196-
197-
LOCAL_LABEL(JIT_MemSet_0x9c):
198-
b LOCAL_LABEL(JIT_MemSet_0xa8)
199-
LOCAL_LABEL(JIT_MemSet_0xa0):
200-
stp x8, x8, [x0], #16
201-
LOCAL_LABEL(JIT_MemSet_0xa8):
202-
subs x2, x2, #16
203-
b.ge LOCAL_LABEL(JIT_MemSet_0xa0)
204-
205-
LOCAL_LABEL(JIT_MemSet_0xb0):
206-
tbz x2, #3, LOCAL_LABEL(JIT_MemSet_0xb4)
207-
str x8, [x0], #8
208-
LOCAL_LABEL(JIT_MemSet_0xb4):
209-
tbz x2, #2, LOCAL_LABEL(JIT_MemSet_0xc8)
210-
str w8, [x0], #4
211-
LOCAL_LABEL(JIT_MemSet_0xc8):
212-
tbz x2, #1, LOCAL_LABEL(JIT_MemSet_0xdc)
213-
strh w8, [x0], #2
214-
LOCAL_LABEL(JIT_MemSet_0xdc):
215-
tbz x2, #0, LOCAL_LABEL(JIT_MemSet_0xe8)
216-
strb w8, [x0]
217-
LOCAL_LABEL(JIT_MemSet_0xe8):
20+
LOCAL_LABEL(JIT_MemSet_ret):
21821
ret lr
21922
LEAF_END_MARKED JIT_MemSet, _TEXT
22023

221-
// See comments above for JIT_MemSet
24+
LEAF_ENTRY JIT_MemCpy, _TEXT
25+
cbz x2, LOCAL_LABEL(JIT_MemCpy_ret)
22226

223-
//void JIT_MemCpy(void *dst, const void *src, SIZE_T count)
224-
//
225-
// // If not aligned then make it 8-byte aligned
226-
// if(((uintptr_t)dst&0x7) != 0)
227-
// {
228-
// // Calculate alignment we can do without exceeding count
229-
// // Use math to avoid introducing more unpredictable branches
230-
// // Due to inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0
231-
// // Note logic will fail if count >= (1 << 61). But this exceeds max physical memory for arm64
232-
// uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64))
233-
//
234-
// if(align&0x1)
235-
// {
236-
// *(uint8_t*)dst = *(uint8_t*)src;
237-
// dst = (uint8_t*)dst + 1;
238-
// src = (uint8_t*)src + 1;
239-
// count-=1;
240-
// }
241-
//
242-
// if(align&0x2)
243-
// {
244-
// *(uint16_t*)dst = *(uint16_t*)src;
245-
// dst = (uint16_t*)dst + 1;
246-
// src = (uint16_t*)src + 1;
247-
// count-=2;
248-
// }
249-
//
250-
// if(align&0x4)
251-
// {
252-
// *(uint32_t*)dst = *(uint32_t*)src;
253-
// dst = (uint32_t*)dst + 1;
254-
// src = (uint32_t*)src + 1;
255-
// count-=4;
256-
// }
257-
// }
258-
//
259-
// count-=16;
260-
//
261-
// while(count >= 0)
262-
// {
263-
// *(uint64_t*)dst = *(uint64_t*)src;
264-
// dst = (uint64_t*)dst + 1;
265-
// src = (uint64_t*)src + 1;
266-
// *(uint64_t*)dst = *(uint64_t*)src;
267-
// dst = (uint64_t*)dst + 1;
268-
// src = (uint64_t*)src + 1;
269-
// count-=16;
270-
// }
271-
//
272-
// if(count & 8)
273-
// {
274-
// *(uint64_t*)dst = *(uint64_t*)src;
275-
// dst = (uint64_t*)dst + 1;
276-
// src = (uint64_t*)src + 1;
277-
// }
278-
//
279-
// if(count & 4)
280-
// {
281-
// *(uint32_t*)dst = *(uint32_t*)src;
282-
// dst = (uint32_t*)dst + 1;
283-
// src = (uint32_t*)src + 1;
284-
// }
285-
//
286-
// if(count & 2)
287-
// {
288-
// *(uint16_t*)dst = *(uint16_t*)src;
289-
// dst = (uint16_t*)dst + 1;
290-
// src = (uint16_t*)src + 1;
291-
// }
292-
//
293-
// if(count & 1)
294-
// {
295-
// *(uint8_t*)dst = *(uint8_t*)src;
296-
// }
297-
//
298-
//
27+
strb wzr, [x0]
28+
ldrb wzr, [x1]
29929

300-
// Assembly code corresponding to above C++ method.
301-
// See comments above for JIT_MemSet method
302-
LEAF_ENTRY JIT_MemCpy, _TEXT
303-
ands x3, x0, #7
304-
movn x4, #7
305-
clz x5, x2
306-
b.eq LOCAL_LABEL(JIT_MemCpy_0xa8)
307-
lsr x4, x4, x5
308-
and x3, x3, x4
309-
tbz x3, #0, LOCAL_LABEL(JIT_MemCpy_0x2c)
310-
ldrsb w8, [x1], #1
311-
strb w8, [x0], #1
312-
sub x2, x2, #1
313-
LOCAL_LABEL(JIT_MemCpy_0x2c):
314-
tbz x3, #1, LOCAL_LABEL(JIT_MemCpy_0x5c)
315-
ldrsh w8, [x1], #2
316-
strh w8, [x0], #2
317-
sub x2, x2, #2
318-
LOCAL_LABEL(JIT_MemCpy_0x5c):
319-
tbz x3, #2, LOCAL_LABEL(JIT_MemCpy_0xa8)
320-
ldr w8, [x1], #4
321-
str w8, [x0], #4
322-
sub x2, x2, #4
323-
b LOCAL_LABEL(JIT_MemCpy_0xa8)
324-
LOCAL_LABEL(JIT_MemCpy_0xa0):
325-
ldp x8, x9, [x1], #16
326-
stp x8, x9, [x0], #16
327-
LOCAL_LABEL(JIT_MemCpy_0xa8):
328-
subs x2, x2, #16
329-
b.ge LOCAL_LABEL(JIT_MemCpy_0xa0)
330-
LOCAL_LABEL(JIT_MemCpy_0xb0):
331-
tbz x2, #3, LOCAL_LABEL(JIT_MemCpy_0xb4)
332-
ldr x8, [x1], #8
333-
str x8, [x0], #8
334-
LOCAL_LABEL(JIT_MemCpy_0xb4):
335-
tbz x2, #2, LOCAL_LABEL(JIT_MemCpy_0xc8)
336-
ldr w8, [x1], #4
337-
str w8, [x0], #4
338-
LOCAL_LABEL(JIT_MemCpy_0xc8):
339-
tbz x2, #1, LOCAL_LABEL(JIT_MemCpy_0xdc)
340-
ldrsh w8, [x1], #2
341-
strh w8, [x0], #2
342-
LOCAL_LABEL(JIT_MemCpy_0xdc):
343-
tbz x2, #0, LOCAL_LABEL(JIT_MemCpy_0xe8)
344-
ldrsb w8, [x1]
345-
strb w8, [x0]
346-
LOCAL_LABEL(JIT_MemCpy_0xe8):
30+
b C_PLTFUNC(memcpy)
31+
32+
LOCAL_LABEL(JIT_MemCpy_ret):
34733
ret lr
34834
LEAF_END_MARKED JIT_MemCpy, _TEXT

0 commit comments

Comments
 (0)