|
2 | 2 | // The .NET Foundation licenses this file to you under the MIT license.
|
3 | 3 | // See the LICENSE file in the project root for more information.
|
4 | 4 |
|
5 |
| -// ==++== |
6 |
| -// |
7 |
| - |
8 | 5 | #include "unixasmmacros.inc"
|
9 | 6 |
|
| 7 | +// JIT_MemSet/JIT_MemCpy |
10 | 8 | //
|
11 |
| -// ==--== |
12 |
| - |
13 |
| -// Calls to JIT_MemSet are emitted by the JIT for initialization of large structs. |
14 |
| -// We need to provide our own implementation of memset instead of using the one in the crt because the crt implementation does not guarantee |
15 |
| -// that aligned 8/4/2 - byte memory will be written atomically. This is required because members in a struct can be read atomically |
16 |
| -// and their values should be written atomically. |
17 |
| -// |
18 |
| -// |
19 |
| -//void JIT_MemSet(void *dst, int val, SIZE_T count) |
20 |
| -// |
21 |
| -// uint64_t valEx = (unsigned char)val; |
22 |
| -// valEx = valEx | valEx << 8; |
23 |
| -// valEx = valEx | valEx << 16; |
24 |
| -// valEx = valEx | valEx << 32; |
25 |
| -// |
26 |
| -// size_t dc_zva_size = 4ULL << DCZID_EL0.BS; |
27 |
| -// |
28 |
| -// uint64_t use_dc_zva = (val == 0) && !DCZID_EL0.p ? count / (2 * dc_zva_size) : 0; // ~Minimum size (assumes worst case alignment) |
29 |
| -// |
30 |
| -// // If not aligned then make it 8-byte aligned |
31 |
| -// if(((uint64_t)dst&0xf) != 0) |
32 |
| -// { |
33 |
| -// // Calculate alignment we can do without exceeding count |
34 |
| -// // Use math to avoid introducing more unpredictable branches |
35 |
| -// // Due to inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0 |
36 |
| -// // Note: the logic will fail if count >= (1 << 61), but this exceeds max physical memory for arm64 |
37 |
| -// uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64)) |
38 |
| -// |
39 |
| -// if(align&0x1) |
40 |
| -// { |
41 |
| -// *(unit8_t*)dst = (unit8_t)valEx; |
42 |
| -// dst = (unit8_t*)dst + 1; |
43 |
| -// count-=1; |
44 |
| -// } |
45 |
| -// |
46 |
| -// if(align&0x2) |
47 |
| -// { |
48 |
| -// *(unit16_t*)dst = (unit16_t)valEx; |
49 |
| -// dst = (unit16_t*)dst + 1; |
50 |
| -// count-=2; |
51 |
| -// } |
52 |
| -// |
53 |
| -// if(align&0x4) |
54 |
| -// { |
55 |
| -// *(unit32_t*)dst = (unit32_t)valEx; |
56 |
| -// dst = (unit32_t*)dst + 1; |
57 |
| -// count-=4; |
58 |
| -// } |
59 |
| -// } |
60 |
| -// |
61 |
| -// if(use_dc_zva) |
62 |
| -// { |
63 |
| -// // If not aligned then make it aligned to dc_zva_size |
64 |
| -// if(dst&0x8) |
65 |
| -// { |
66 |
| -// *(uint64_t*)dst = (uint64_t)valEx; |
67 |
| -// dst = (uint64_t*)dst + 1; |
68 |
| -// count-=8; |
69 |
| -// } |
| 9 | +// It is IMPORTANT that the exception handling code is able to find these functions |
| 10 | +// on the stack, but on non-Windows platforms we can just defer to the platform |
| 11 | +// implementation. |
70 | 12 | //
|
71 |
| -// while(dst & (dc_zva_size - 1)) |
72 |
| -// { |
73 |
| -// *(uint64_t*)dst = valEx; |
74 |
| -// dst = (uint64_t*)dst + 1; |
75 |
| -// *(uint64_t*)dst = valEx; |
76 |
| -// dst = (uint64_t*)dst + 1; |
77 |
| -// count-=16; |
78 |
| -// } |
79 |
| -// |
80 |
| -// count -= dc_zva_size; |
81 |
| -// |
82 |
| -// while(count >= 0) |
83 |
| -// { |
84 |
| -// dc_zva(dst); |
85 |
| -// dst = (uint8_t*)dst + dc_zva_size; |
86 |
| -// count-=dc_zva_size; |
87 |
| -// } |
88 |
| -// |
89 |
| -// count += dc_zva_size; |
90 |
| -// } |
91 |
| -// |
92 |
| -// count-=16; |
93 |
| -// |
94 |
| -// while(count >= 0) |
95 |
| -// { |
96 |
| -// *(uint64_t*)dst = valEx; |
97 |
| -// dst = (uint64_t*)dst + 1; |
98 |
| -// *(uint64_t*)dst = valEx; |
99 |
| -// dst = (uint64_t*)dst + 1; |
100 |
| -// count-=16; |
101 |
| -// } |
102 |
| -// |
103 |
| -// if(count & 8) |
104 |
| -// { |
105 |
| -// *(uint64_t*)dst = valEx; |
106 |
| -// dst = (uint64_t*)dst + 1; |
107 |
| -// } |
108 |
| -// |
109 |
| -// if(count & 4) |
110 |
| -// { |
111 |
| -// *(uint32_t*)dst = (uint32_t)valEx; |
112 |
| -// dst = (uint32_t*)dst + 1; |
113 |
| -// } |
114 |
| -// |
115 |
| -// if(count & 2) |
116 |
| -// { |
117 |
| -// *(uint16_t*)dst = (uint16_t)valEx; |
118 |
| -// dst = (uint16_t*)dst + 1; |
119 |
| -// } |
120 |
| -// |
121 |
| -// if(count & 1) |
122 |
| -// { |
123 |
| -// *(uint8_t*)dst = (uint8_t)valEx; |
124 |
| -// } |
125 |
| -// |
126 |
| -// |
127 |
| - |
128 |
| -// Assembly code corresponding to above C++ method. JIT_MemSet can AV and clr exception personality routine needs to |
129 |
| -// determine if the exception has taken place inside JIT_Memset in order to throw corresponding managed exception. |
130 |
| -// Determining this is slow if the method were implemented as C++ method (using unwind info). In .asm file by adding JIT_MemSet_End |
131 |
| -// marker it can be easily determined if exception happened in JIT_MemSet. Therefore, JIT_MemSet has been written in assembly instead of |
132 |
| -// as C++ method. |
133 |
| - |
// JIT_MemSet(dst = x0, val = w1, count = x2)
//
// Thin wrapper over the platform memset. A zero count returns immediately
// without touching memory. Otherwise the low byte of val is stored to dst[0]
// and control is transferred to memset with the original arguments intact.
//
// NOTE(review): the one-byte store appears to exist so that an invalid dst
// faults on an instruction inside this marked leaf function, where the
// exception handling code can recognise it, instead of somewhere inside the
// C library - confirm against the exception personality routine.
LEAF_ENTRY JIT_MemSet, _TEXT
    cbnz x2, LOCAL_LABEL(JIT_MemSet_probe)
    ret lr                              // count == 0: nothing to write

LOCAL_LABEL(JIT_MemSet_probe):
    strb w1, [x0]                       // dst[0] = (byte)val; memset stores the same byte again
    b C_PLTFUNC(memset)                 // plain branch (no link): memset returns to our caller
LEAF_END_MARKED JIT_MemSet, _TEXT
|
220 | 23 |
|
221 |
| -// See comments above for JIT_MemSet |
// JIT_MemCpy(dst = x0, src = x1, count = x2)
//
// Thin wrapper over the platform memcpy. A zero count returns immediately
// without touching memory. Otherwise the first byte is copied here, inside
// the marked leaf function, before branching to memcpy with the original
// arguments intact.
//
// The first-byte copy doubles as the access probe for both pointers: the load
// touches src for read and the store touches dst for write. The probe must
// store the real source byte rather than zero, because a zero store
//   - destroys src[0] before memcpy reads it when dst == src, and
//   - makes dst[0] transiently hold a value that is neither its old nor its
//     new contents, which a concurrent reader could observe.
//
// NOTE(review): the probes appear to exist so that an invalid pointer faults
// on an instruction inside this marked function, where the exception handling
// code can recognise it - confirm against the exception personality routine.
LEAF_ENTRY JIT_MemCpy, _TEXT
    cbz x2, LOCAL_LABEL(JIT_MemCpy_ret)

    ldrb w8, [x1]                       // probe src (read); w8 is caller-saved scratch, not a memcpy argument
    strb w8, [x0]                       // probe dst (write) with the byte memcpy will store anyway

    b C_PLTFUNC(memcpy)                 // plain branch (no link): memcpy returns to our caller

LOCAL_LABEL(JIT_MemCpy_ret):
    ret lr
LEAF_END_MARKED JIT_MemCpy, _TEXT
|
0 commit comments