This repository was archived by the owner on Jan 23, 2023. It is now read-only.

[Arm64/Linux] Use platform memset/memcpy #17536

Merged
merged 1 commit on Apr 13, 2018
2 changes: 1 addition & 1 deletion src/vm/amd64/crthelpers.S
@@ -8,7 +8,7 @@

// JIT_MemSet/JIT_MemCpy
//
// It is IMPORANT that the exception handling code is able to find these guys
// It is IMPORTANT that the exception handling code is able to find these guys
// on the stack, but on non-windows platforms we can just defer to the platform
// implementation.
//
344 changes: 15 additions & 329 deletions src/vm/arm64/crthelpers.S
@@ -2,347 +2,33 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

// ==++==
//

#include "unixasmmacros.inc"

// JIT_MemSet/JIT_MemCpy
//
// ==--==

// Calls to JIT_MemSet are emitted by the JIT for initialization of large structs.
// We need to provide our own implementation of memset instead of using the one in the CRT, because the CRT implementation does not guarantee
// that aligned 8/4/2-byte memory is written atomically. This is required because struct members can be read atomically,
// so their values must also be written atomically.
//
//
//void JIT_MemSet(void *dst, int val, SIZE_T count)
//
// uint64_t valEx = (unsigned char)val;
// valEx = valEx | valEx << 8;
// valEx = valEx | valEx << 16;
// valEx = valEx | valEx << 32;
//
// size_t dc_zva_size = 4ULL << DCZID_EL0.BS;
//
// uint64_t use_dc_zva = (val == 0) && !DCZID_EL0.p ? count / (2 * dc_zva_size) : 0; // ~Minimum size (assumes worst case alignment)
//
// // If not aligned then make it 8-byte aligned
// if(((uint64_t)dst&0x7) != 0)
// {
// // Calculate the alignment we can do without exceeding count
// // Use math to avoid introducing more unpredictable branches
// // Due to the inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0
// // Note the logic will fail if count >= (1 << 61), but that exceeds the max physical memory for arm64
// // (a small worked sketch of this expression follows this comment block)
// uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64))
//
// if(align&0x1)
// {
// *(uint8_t*)dst = (uint8_t)valEx;
// dst = (uint8_t*)dst + 1;
// count-=1;
// }
//
// if(align&0x2)
// {
// *(uint16_t*)dst = (uint16_t)valEx;
// dst = (uint16_t*)dst + 1;
// count-=2;
// }
//
// if(align&0x4)
// {
// *(uint32_t*)dst = (uint32_t)valEx;
// dst = (uint32_t*)dst + 1;
// count-=4;
// }
// }
//
// if(use_dc_zva)
// {
// // If not aligned then make it aligned to dc_zva_size
// if(dst&0x8)
// {
// *(uint64_t*)dst = (uint64_t)valEx;
// dst = (uint64_t*)dst + 1;
// count-=8;
// }
// It is IMPORTANT that the exception handling code is able to find these guys
// on the stack, but on non-windows platforms we can just defer to the platform
// implementation.
//
// while(dst & (dc_zva_size - 1))
// {
// *(uint64_t*)dst = valEx;
// dst = (uint64_t*)dst + 1;
// *(uint64_t*)dst = valEx;
// dst = (uint64_t*)dst + 1;
// count-=16;
// }
//
// count -= dc_zva_size;
//
// while(count >= 0)
// {
// dc_zva(dst);
// dst = (uint8_t*)dst + dc_zva_size;
// count-=dc_zva_size;
// }
//
// count += dc_zva_size;
// }
//
// count-=16;
//
// while(count >= 0)
// {
// *(uint64_t*)dst = valEx;
// dst = (uint64_t*)dst + 1;
// *(uint64_t*)dst = valEx;
// dst = (uint64_t*)dst + 1;
// count-=16;
// }
//
// if(count & 8)
// {
// *(uint64_t*)dst = valEx;
// dst = (uint64_t*)dst + 1;
// }
//
// if(count & 4)
// {
// *(uint32_t*)dst = (uint32_t)valEx;
// dst = (uint32_t*)dst + 1;
// }
//
// if(count & 2)
// {
// *(uint16_t*)dst = (uint16_t)valEx;
// dst = (uint16_t*)dst + 1;
// }
//
// if(count & 1)
// {
// *(uint8_t*)dst = (uint8_t)valEx;
// }
//
//
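//
// A minimal worked sketch of the alignment expression above (not part of the original source;
// the helper name and the use of __builtin_clzll are illustrative). It evaluates the expression
// the way the hardware does: clz(0) is 64 on arm64 and lsr takes its shift amount mod 64, which
// is why the mask is seeded with ~7 rather than ~0.
//
// #include <stdint.h>
//
// static uint8_t align_fixup(uint64_t dst, uint64_t count)
// {
//     unsigned shift = (count == 0) ? 0 : ((unsigned)__builtin_clzll(count) & 63); // hw: clz(0) = 64, 64 mod 64 = 0
//     uint64_t mask  = ~(uint64_t)7 >> shift;                                      // ~7 seed makes count == 0 yield 0
//     return (uint8_t)(dst & 0x7 & mask);
// }
//
// // align_fixup(0x1003, 0)   == 0  -> nothing is written when count is zero
// // align_fixup(0x1003, 1)   == 1  -> only the 1-byte store runs
// // align_fixup(0x1003, 100) == 3  -> the 1-byte and then the 2-byte store run
//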

// Assembly code corresponding to the above C++ method. JIT_MemSet can AV, and the CLR exception personality routine needs to
// determine whether the exception occurred inside JIT_MemSet in order to throw the corresponding managed exception.
// Determining this is slow if the method is implemented as a C++ method (it requires unwind info). By adding the JIT_MemSet_End
// marker in the .asm file, it is easy to tell whether the exception happened inside JIT_MemSet. Therefore, JIT_MemSet is written
// in assembly instead of as a C++ method.
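
// A minimal sketch of the range check the comment above is describing (illustrative only; the
// actual CLR personality routine is more involved). LEAF_END_MARKED emits the *_End symbol that
// turns the check into a cheap address comparison instead of an unwind-info lookup.
//
// #include <stdint.h>
// #include <stdbool.h>
//
// void JIT_MemSet(void);      // start label emitted by LEAF_ENTRY
// void JIT_MemSet_End(void);  // end marker emitted by LEAF_END_MARKED
//
// static bool IsIpInJitMemSet(uintptr_t ip)
// {
//     return ip >= (uintptr_t)&JIT_MemSet && ip < (uintptr_t)&JIT_MemSet_End;
// }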

LEAF_ENTRY JIT_MemSet, _TEXT
ands w8, w1, #0xff
mrs x3, DCZID_EL0 // x3 = DCZID_EL0
mov x6, #4
lsr x11, x2, #3 // x11 = count >> 3

orr w8, w8, w8, lsl #8
and x5, x3, #0xf // x5 = dczid_el0.bs
csel x11, x11, xzr, eq // x11 = (val == 0) ? count >> 3 : 0
tst x3, (1 << 4)

orr w8, w8, w8, lsl #0x10
csel x11, x11, xzr, eq // x11 = (val == 0) && !DCZID_EL0.p ? count >> 3 : 0
ands x3, x0, #7 // x3 = dst & 7
lsl x9, x6, x5 // x9 = size

orr x8, x8, x8, lsl #0x20
lsr x11, x11, x5 // x11 = (val == 0) && !DCZID_EL0.p ? count >> (3 + DCZID_EL0.bs) : 0
sub x10, x9, #1 // x10 = mask

b.eq LOCAL_LABEL(JIT_MemSet_0x80)

movn x4, #7
clz x5, x2
lsr x4, x4, x5
and x3, x3, x4
cbz x2, LOCAL_LABEL(JIT_MemSet_ret)

tbz x3, #0, LOCAL_LABEL(JIT_MemSet_0x2c)
strb w8, [x0], #1
sub x2, x2, #1
LOCAL_LABEL(JIT_MemSet_0x2c):
tbz x3, #1, LOCAL_LABEL(JIT_MemSet_0x5c)
strh w8, [x0], #2
sub x2, x2, #2
LOCAL_LABEL(JIT_MemSet_0x5c):
tbz x3, #2, LOCAL_LABEL(JIT_MemSet_0x80)
str w8, [x0], #4
sub x2, x2, #4
LOCAL_LABEL(JIT_MemSet_0x80):
cbz x11, LOCAL_LABEL(JIT_MemSet_0x9c)
tbz x0, #3, LOCAL_LABEL(JIT_MemSet_0x84)
str x8, [x0], #8
sub x2, x2, #8
strb w1, [x0]

b LOCAL_LABEL(JIT_MemSet_0x85)
LOCAL_LABEL(JIT_MemSet_0x84):
stp x8, x8, [x0], #16
sub x2, x2, #16
LOCAL_LABEL(JIT_MemSet_0x85):
tst x0, x10
b.ne LOCAL_LABEL(JIT_MemSet_0x84)
b C_PLTFUNC(memset)

b LOCAL_LABEL(JIT_MemSet_0x8a)
LOCAL_LABEL(JIT_MemSet_0x88):
dc zva, x0
add x0, x0, x9
LOCAL_LABEL(JIT_MemSet_0x8a):
subs x2, x2, x9
b.ge LOCAL_LABEL(JIT_MemSet_0x88)

LOCAL_LABEL(JIT_MemSet_0x8c):
add x2, x2, x9

LOCAL_LABEL(JIT_MemSet_0x9c):
b LOCAL_LABEL(JIT_MemSet_0xa8)
LOCAL_LABEL(JIT_MemSet_0xa0):
stp x8, x8, [x0], #16
LOCAL_LABEL(JIT_MemSet_0xa8):
subs x2, x2, #16
b.ge LOCAL_LABEL(JIT_MemSet_0xa0)

LOCAL_LABEL(JIT_MemSet_0xb0):
tbz x2, #3, LOCAL_LABEL(JIT_MemSet_0xb4)
str x8, [x0], #8
LOCAL_LABEL(JIT_MemSet_0xb4):
tbz x2, #2, LOCAL_LABEL(JIT_MemSet_0xc8)
str w8, [x0], #4
LOCAL_LABEL(JIT_MemSet_0xc8):
tbz x2, #1, LOCAL_LABEL(JIT_MemSet_0xdc)
strh w8, [x0], #2
LOCAL_LABEL(JIT_MemSet_0xdc):
tbz x2, #0, LOCAL_LABEL(JIT_MemSet_0xe8)
strb w8, [x0]
LOCAL_LABEL(JIT_MemSet_0xe8):
LOCAL_LABEL(JIT_MemSet_ret):
ret lr
LEAF_END_MARKED JIT_MemSet, _TEXT
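
// For reference, a minimal sketch of the DCZID_EL0 / DC ZVA handling used above (not part of the
// original source; assumes an AArch64 GCC/Clang target). Bits [3:0] of DCZID_EL0 give log2 of the
// zeroing block size in 4-byte words, and bit 4 (DZP) set means DC ZVA must not be used.
//
// #include <stdint.h>
// #include <stddef.h>
//
// static size_t dc_zva_block_size(void)
// {
//     uint64_t dczid;
//     __asm__ volatile ("mrs %0, dczid_el0" : "=r"(dczid));
//     if (dczid & (1u << 4))
//         return 0;                          // DC ZVA prohibited (DCZID_EL0.p in the comments above)
//     return (size_t)4 << (dczid & 0xf);    // 4ULL << DCZID_EL0.BS, as in the pseudocode above
// }
//
// static void dc_zva(void *p)               // p must be aligned to dc_zva_block_size()
// {
//     __asm__ volatile ("dc zva, %0" : : "r"(p) : "memory");
// }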

// See comments above for JIT_MemSet
LEAF_ENTRY JIT_MemCpy, _TEXT
cbz x2, LOCAL_LABEL(JIT_MemCpy_ret)

//void JIT_MemCpy(void *dst, const void *src, SIZE_T count)
//
// // If not aligned then make it 8-byte aligned
// if(((uintptr_t)dst&0x7) != 0)
// {
// // Calculate the alignment we can do without exceeding count
// // Use math to avoid introducing more unpredictable branches
// // Due to the inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0
// // Note the logic will fail if count >= (1 << 61), but that exceeds the max physical memory for arm64
// uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64))
//
// if(align&0x1)
// {
// *(uint8_t*)dst = *(uint8_t*)src;
// dst = (uint8_t*)dst + 1;
// src = (uint8_t*)src + 1;
// count-=1;
// }
//
// if(align&0x2)
// {
// *(uint16_t*)dst = *(uint16_t*)src;
// dst = (uint16_t*)dst + 1;
// src = (uint16_t*)src + 1;
// count-=2;
// }
//
// if(align&0x4)
// {
// *(uint32_t*)dst = *(uint32_t*)src;
// dst = (uint32_t*)dst + 1;
// src = (uint32_t*)src + 1;
// count-=4;
// }
// }
//
// count-=16;
//
// while(count >= 0)
// {
// *(uint64_t*)dst = *(uint64_t*)src;
// dst = (uint64_t*)dst + 1;
// src = (uint64_t*)src + 1;
// *(uint64_t*)dst = *(uint64_t*)src;
// dst = (uint64_t*)dst + 1;
// src = (uint64_t*)src + 1;
// count-=16;
// }
//
// if(count & 8)
// {
// *(uint64_t*)dst = *(uint64_t*)src;
// dst = (uint64_t*)dst + 1;
// src = (uint64_t*)src + 1;
// }
//
// if(count & 4)
// {
// *(uint32_t*)dst = *(uint32_t*)src;
// dst = (uint32_t*)dst + 1;
// src = (uint32_t*)src + 1;
// }
//
// if(count & 2)
// {
// *(uint16_t*)dst = *(uint16_t*)src;
// dst = (uint16_t*)dst + 1;
// src = (uint16_t*)src + 1;
// }
//
// if(count & 1)
// {
// *(uint8_t*)dst = *(uint8_t*)src;
// }
//
//
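//
// The two probes below are part of the new code added by this change: they touch the destination
// and the source once so that a fault on a bad dst/src is raised here, inside the marked
// JIT_MemCpy range, before control tail-calls the platform memcpy.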
strb wzr, [x0]
ldrb wzr, [x1]

// Assembly code corresponding to the above C++ method.
// See the comments above for the JIT_MemSet method.
LEAF_ENTRY JIT_MemCpy, _TEXT
ands x3, x0, #7
movn x4, #7
clz x5, x2
b.eq LOCAL_LABEL(JIT_MemCpy_0xa8)
lsr x4, x4, x5
and x3, x3, x4
tbz x3, #0, LOCAL_LABEL(JIT_MemCpy_0x2c)
ldrsb w8, [x1], #1
strb w8, [x0], #1
sub x2, x2, #1
LOCAL_LABEL(JIT_MemCpy_0x2c):
tbz x3, #1, LOCAL_LABEL(JIT_MemCpy_0x5c)
ldrsh w8, [x1], #2
strh w8, [x0], #2
sub x2, x2, #2
LOCAL_LABEL(JIT_MemCpy_0x5c):
tbz x3, #2, LOCAL_LABEL(JIT_MemCpy_0xa8)
ldr w8, [x1], #4
str w8, [x0], #4
sub x2, x2, #4
b LOCAL_LABEL(JIT_MemCpy_0xa8)
LOCAL_LABEL(JIT_MemCpy_0xa0):
ldp x8, x9, [x1], #16
stp x8, x9, [x0], #16
LOCAL_LABEL(JIT_MemCpy_0xa8):
subs x2, x2, #16
b.ge LOCAL_LABEL(JIT_MemCpy_0xa0)
LOCAL_LABEL(JIT_MemCpy_0xb0):
tbz x2, #3, LOCAL_LABEL(JIT_MemCpy_0xb4)
ldr x8, [x1], #8
str x8, [x0], #8
LOCAL_LABEL(JIT_MemCpy_0xb4):
tbz x2, #2, LOCAL_LABEL(JIT_MemCpy_0xc8)
ldr w8, [x1], #4
str w8, [x0], #4
LOCAL_LABEL(JIT_MemCpy_0xc8):
tbz x2, #1, LOCAL_LABEL(JIT_MemCpy_0xdc)
ldrsh w8, [x1], #2
strh w8, [x0], #2
LOCAL_LABEL(JIT_MemCpy_0xdc):
tbz x2, #0, LOCAL_LABEL(JIT_MemCpy_0xe8)
ldrsb w8, [x1]
strb w8, [x0]
LOCAL_LABEL(JIT_MemCpy_0xe8):
b C_PLTFUNC(memcpy)

LOCAL_LABEL(JIT_MemCpy_ret):
ret lr
LEAF_END_MARKED JIT_MemCpy, _TEXT
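
For reference, a C-level sketch of what the simplified helpers amount to after this change
(illustrative only; the real code stays in assembly so that the JIT_MemSet_End/JIT_MemCpy_End
markers keep the exception-handling range check described above working):

    #include <string.h>
    #include <stddef.h>

    /* Hypothetical C equivalents of the new stubs; in the real assembly the dst/src probe
       happens inside the marked range before the tail call to the platform routine. */
    void JIT_MemSet(void *dst, int val, size_t count)
    {
        if (count != 0)
            memset(dst, val, count);    /* defer to the platform memset */
    }

    void JIT_MemCpy(void *dst, const void *src, size_t count)
    {
        if (count != 0)
            memcpy(dst, src, count);    /* defer to the platform memcpy */
    }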