diff --git a/tcmalloc/internal/BUILD b/tcmalloc/internal/BUILD index e6de3821c..cc03c8166 100644 --- a/tcmalloc/internal/BUILD +++ b/tcmalloc/internal/BUILD @@ -246,6 +246,7 @@ cc_library( textual_hdrs = [ "percpu_rseq_ppc.S", "percpu_rseq_x86_64.S", + "percpu_rseq_aarch64.S", ], visibility = [ "//tcmalloc:__subpackages__", diff --git a/tcmalloc/internal/linux_syscall_support.h b/tcmalloc/internal/linux_syscall_support.h index 2411de918..0abf54ff1 100644 --- a/tcmalloc/internal/linux_syscall_support.h +++ b/tcmalloc/internal/linux_syscall_support.h @@ -56,7 +56,7 @@ static_assert(sizeof(kernel_rseq_cs) == (4 * sizeof(unsigned long long)), #if defined(__x86_64__) #define __NR_rseq 334 #elif defined(__aarch64__) -#define __NR_rseq 398 +#define __NR_rseq 293 #elif defined(__PPC__) #define __NR_rseq 387 #endif diff --git a/tcmalloc/internal/percpu.h b/tcmalloc/internal/percpu.h index 76882a487..079ff433a 100644 --- a/tcmalloc/internal/percpu.h +++ b/tcmalloc/internal/percpu.h @@ -19,7 +19,8 @@ // PERCPU_RSEQ_SUPPORTED_PLATFORM defines whether or not we have an // implementation for the target OS and architecture. -#if defined(__linux__) && (defined(__x86_64__) || defined(__PPC64__)) +#if defined(__linux__) \ + && (defined(__x86_64__) || defined(__PPC64__) || defined (__aarch64__)) #define PERCPU_RSEQ_SUPPORTED_PLATFORM 1 #else #define PERCPU_RSEQ_SUPPORTED_PLATFORM 0 @@ -31,6 +32,8 @@ #define PERCPU_RSEQ_SIGNATURE 0x53053053 #elif defined(__ppc__) #define PERCPU_RSEQ_SIGNATURE 0x0FE5000B +#elif defined(__aarch64__) +#define PERCPU_RSEQ_SIGNATURE 0xd428bc00 #else // Rather than error, allow us to build, but with an invalid signature. #define PERCPU_RSEQ_SIGNATURE 0x0 diff --git a/tcmalloc/internal/percpu_rseq_aarch64.S b/tcmalloc/internal/percpu_rseq_aarch64.S new file mode 100644 index 000000000..5dd2ea81f --- /dev/null +++ b/tcmalloc/internal/percpu_rseq_aarch64.S @@ -0,0 +1,487 @@ +/* + * Copyright 2020 The TCMalloc Authors + * + * Licensed under the Apache License, Version 2.0 (the "License") + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __aarch64__ +#error "percpu_rseq_aarch64.S should only be included for AArch64 builds" +#endif // __aarch64__ + +#include "tcmalloc/internal/percpu.h" + +/* + * API Exposition: + * + * METHOD_abort: // Emitted as part of START_RSEQ() + * START_RSEQ() // Starts critical section between [start,commit) + * METHOD_start: // Emitted as part of START_RSEQ() + * FETCH_CPU() // Reads current CPU + * ... + * single store // Commits sequence + * METHOD_commit: + * ...return... + * + * This process is assisted by the DEFINE_UPSTREAM_CS macro, which encodes a + * (rodata) constant table, whose address is used to start the critical + * section, and the abort trampoline. + * + * The trampoline is used because: + * 1. Restarts are expected to be rare, so the extra jump when restarting is + * expected to be infrequent. + * 2. The upstream restartable sequence implementation expects the trailing 4 + * bytes of the abort PC to be "signed" (to prevent manipulation of the PC + * to an arbitrary choice). 
For us, this is PERCPU_RSEQ_SIGNATURE. This + * value is passed to the kernel during configuration of the rseq syscall. + * This would either need to be encoded as a nop (SIGN_ABORT) at the start + * of every restartable sequence, increasing instruction cache pressure, or + * placed directly before the entry point. + * + * The trampoline returns us to METHOD_abort, which is the normal entry point + * for the restartable sequence. Upon restart, the (upstream) kernel API + * clears the per-thread restartable sequence state. We return to METHOD_abort + * (rather than METHOD_start), as we need to reinitialize this value. + */ + +/* Place the code into the google_malloc section. This section is the heaviest + * user of Rseq code, so it makes sense to co-locate it. + */ + +.section google_malloc, "ax" + +/* ---------------- start helper macros ---------------- */ + +// This macro defines a relocation associated with the provided label to keep +// section GC from discarding it independently of label. +#if !defined(__clang_major__) || __clang_major__ >= 9 +#define PINSECTION(label) .reloc 0, R_AARCH64_NONE, label +#else +#define PINSECTION(label) +#endif + +// This macro defines: +// * the rseq_cs instance that we'll use for label's critical section. +// * a trampoline to return to when we abort. This label_trampoline is +// distinct from label_start, as the return IP must be "signed" (see +// SIGN_ABORT()). +// +// TODO(b/141629158): __rseq_cs only needs to be writeable to allow for +// relocations, but could be read-only for non-PIE builds. +#define DEFINE_UPSTREAM_CS(label) \ + .pushsection __rseq_cs, "aw"; \ + .balign 32; \ + .protected __rseq_cs_##label; \ + .type __rseq_cs_##label,@object; \ + .size __rseq_cs_##label,32; \ + __rseq_cs_##label: \ + .long PERCPU_RSEQ_VERSION, PERCPU_RSEQ_FLAGS; \ + .quad .L##label##_start; \ + .quad .L##label##_commit - .L##label##_start; \ + .quad label##_trampoline; \ + PINSECTION(.L##label##array); \ + .popsection; \ + .pushsection __rseq_cs_ptr_array, "aw"; \ + .L##label##array: \ + .quad __rseq_cs_##label; \ + .popsection; \ + .pushsection rseq_trampoline, "ax"; \ + SIGN_ABORT(); \ + .globl label##_trampoline; \ + .type label##_trampoline, @function; \ +label##_trampoline: \ + .cfi_startproc; \ + b .L##label##_abort; \ + .cfi_endproc; \ + .size label##_trampoline, . - label##_trampoline; \ + .popsection; + +// This is part of the upstream rseq ABI. The 4 bytes prior to the abort IP +// must match PERCPU_RSEQ_SIGNATURE (as configured by our rseq syscall's +// signature parameter). This signature is used to annotate valid abort IPs +// (since rseq_cs could live in a user-writable segment). +// We use .inst here instead of a data directive so it works for both small and +// big endian. +#define SIGN_ABORT() \ + .inst PERCPU_RSEQ_SIGNATURE + +/* + * Provide a directive to specify the size of symbol "label", relative to the + * current location and its start. + */ +#define ENCODE_SIZE(label) .size label, . - label +/* We are assuming small memory model. */ +#if !defined(__AARCH64_CMODEL_SMALL__) +#error "Memory model not supported!" +#endif + +/* FETCH_CPU assumes &__rseq_abi is in x5. */ +#define FETCH_CPU(dest) \ + ldr dest, [x5, #4] /* cpuid is 32-bits */ + +/* With PIE have initial-exec TLS, even in the presence of position + independent code. 
*/ +#if !defined(__PIC__) || defined(__PIE__) + +#define START_RSEQ(src) \ + .L##src##_abort: \ + mrs x5, tpidr_el0; \ + adrp x6, :gottprel:__rseq_abi; \ + ldr x6, [x6,:gottprel_lo12:__rseq_abi]; \ + add x5, x5, x6; \ + adrp x6, __rseq_cs_##src; \ + add x6, x6, :lo12:__rseq_cs_##src; \ + str x6, [x5, #8]; \ + .L##src##_start: + +#else /* !defined(__PIC__) || defined(__PIE__) */ + +/* + * In the case where we can't guarantee we have initial-exec TLS we use + * the General Dynamic TLS model to get the address of __rseq_abi. + * The call can be optimized away by the linker, but since we can not guarantee + * it will we must save and restore the registers used to store the arguments + * of our functions. The function with most arguments has 5 arguments, so we + * save x0-x4 and lr. + */ +#define START_RSEQ(src) \ + .L##src##_abort: \ + mov x5, lr; \ + stp x0, x1, [sp, -48]!; \ + stp x2, x3, [sp, #16]; \ + stp x4, x5, [sp, #32]; \ + adrp x0, :tlsdesc:__rseq_abi; \ + ldr x1, [x0, :tlsdesc_lo12:__rseq_abi]; \ + add x0, x0, :tlsdesc_lo12:__rseq_abi; \ + .tlsdesccall __rseq_abi; \ + blr x1; \ + ldp x4, x5, [sp, #32]; \ + mov lr, x5; \ + mrs x5, tpidr_el0; \ + add x5, x5, x0; \ + ldp x2, x3, [sp, #16]; \ + ldp x0, x1, [sp], #48; \ + adrp x6, __rseq_cs_##src; \ + add x6, x6, :lo12:__rseq_cs_##src; \ + str x6, [x5, #8]; \ + .L##src##_start: + +#endif +/* ---------------- end helper macros ---------------- */ + +/* start of atomic restartable sequences */ + +/* + * int TcmallocSlab_PerCpuCmpxchg64(int target_cpu, long *p, + * long old_val, long new_val) + * w0: target_cpu + * x1: p + * x2: old_val + * x3: new_val + */ + .p2align 6 /* aligns to 2^6 with NOP filling */ + .globl TcmallocSlab_PerCpuCmpxchg64 + .type TcmallocSlab_PerCpuCmpxchg64, @function +TcmallocSlab_PerCpuCmpxchg64: + .cfi_startproc +.LTcmallocSlab_PerCpuCmpxchg64_region0: + START_RSEQ(TcmallocSlab_PerCpuCmpxchg64) + FETCH_CPU(w4) + cmp w0, w4 /* check cpu vs current_cpu */ + bne .LTcmallocSlab_PerCpuCmpxchg64_region1 + ldr x6, [x1] + cmp x6, x2 /* verify *p == old */ + bne .LTcmallocSlab_PerCpuCmpxchg64_region2 + str x3, [x1] +.LTcmallocSlab_PerCpuCmpxchg64_region1: +.LTcmallocSlab_PerCpuCmpxchg64_commit: + mov x0, x4 + ret /* return current cpu, indicating mismatch OR success */ +.LTcmallocSlab_PerCpuCmpxchg64_region2: + mov x0, #-1 /* mismatch versus "old" or "check", return -1 */ + ret +.LTcmallocSlab_PerCpuCmpxchg64_region3: + .cfi_endproc +ENCODE_SIZE(TcmallocSlab_PerCpuCmpxchg64) +DEFINE_UPSTREAM_CS(TcmallocSlab_PerCpuCmpxchg64) + +/* size_t TcmallocSlab_PushBatch_FixedShift( + * void *ptr (x0), + * size_t cl (w1), + * void** batch (x2), + * size_t len (w3) { + * uint64_t x8 = __rseq_abi.cpu_id + * uint64_t* x8 = CpuMemoryStart(x0, r8) + * Header* hdr = x8 + w1 * 8 + * uint64_t x9 = hdr->current (zero-extend 16bit) + * uint64_t x10 = hdr->end (zero-extend 16bit) + * if (w9 >= w10) return 0 + * x11 = x3 + * w10 = w9 + min(w3, w10 - w9) + * loop: + * x11-- + * x5 = *(batch + x11 * 8) + * *(x8 + x9 * 8) = x5 + * x9++ + * if (x9 != x10) goto loop + * hdr->current = r9 (16bit store) + * return len - r11 + * } + */ + .p2align 6 /* aligns to 2^6 with NOP filling */ + .globl TcmallocSlab_PushBatch_FixedShift + .type TcmallocSlab_PushBatch_FixedShift, @function +TcmallocSlab_PushBatch_FixedShift: + .cfi_startproc +.LTcmallocSlab_PushBatch_FixedShift_region0: + START_RSEQ(TcmallocSlab_PushBatch_FixedShift) + FETCH_CPU(w8) + lsl x8, x8, #PERCPU_TCMALLOC_FIXED_SLAB_SHIFT /* multiply cpu by 256k */ + add x8, x0, x8 + add x4, x8, x1, LSL #3 + 
ldrh w9, [x4] /* current */ + ldrh w10, [x4, #6] /* end */ + cmp w9, w10 + bge .LTcmallocSlab_PushBatch_FixedShift_region2 + mov x11, x3 /* r11 = copy of len */ + sub w10, w10, w9 /* r10 = free capacity */ + cmp w11, w10 + csel w10, w11, w10, ls /* r10 = min(len, free capacity) */ + add x10, x10, x9 +.LTcmallocSlab_PushBatch_FixedShift_loop: + sub x11 ,x11, #1 + ldr x12, [x2, x11, LSL #3] + str x12, [x8, x9, LSL #3] + add x9, x9, #1 + cmp x9, x10 + bne .LTcmallocSlab_PushBatch_FixedShift_loop + strh w9, [x4] +.LTcmallocSlab_PushBatch_FixedShift_region1: +.LTcmallocSlab_PushBatch_FixedShift_commit: + sub x0, x3, x11 + ret +.LTcmallocSlab_PushBatch_FixedShift_region2: + mov x0, #0 + ret +.LTcmallocSlab_PushBatch_FixedShift_region3: + .cfi_endproc +ENCODE_SIZE(TcmallocSlab_PushBatch_FixedShift) +DEFINE_UPSTREAM_CS(TcmallocSlab_PushBatch_FixedShift) + +/* size_t TcmallocSlab_PopBatch_FixedShift( + * void *ptr (x0), + * size_t cl (w1), + * void** batch (x2), + * size_t len (w3) { + * uint64_t r8 = __rseq_abi.cpu_id + * uint64_t* r8 = CpuMemoryStart(rdi, r8) + * Header* hdr = GetHeader(rdi, rax, cl) + * uint64_t r9 = hdr->current + * uint64_t r10 = hdr->begin + * if (r9 <= r10) return 0 + * r11 = min(rcx, r9 - r10) + * rax = 0 + * loop: + * r9-- + * r10 = *(r8 + r9 * 8) + * batch[rax] = r10 + * rax++ + * if (rax != r11) goto loop + * hdr->current = r9 + * return rax + * } + */ + .p2align 6 /* aligns to 2^6 with NOP filling */ + .globl TcmallocSlab_PopBatch_FixedShift + .type TcmallocSlab_PopBatch_FixedShift, @function +TcmallocSlab_PopBatch_FixedShift: + .cfi_startproc +.LTcmallocSlab_PopBatch_FixedShift_region0: + START_RSEQ(TcmallocSlab_PopBatch_FixedShift) + FETCH_CPU(w8) + lsl x8, x8, #PERCPU_TCMALLOC_FIXED_SLAB_SHIFT /* multiply cpu by 256k */ + add x8, x0, x8 + add x4, x8, x1, LSL #3 + ldrh w9, [x4] /* current */ + ldrh w10, [x4, #4] /* begin */ + cmp w10, w9 + bhs .LTcmallocSlab_PopBatch_FixedShift_region2 + sub w11, w9, w10 /* w11 = available items */ + cmp w3, w11 + csel w11, w3, w11, ls /* r11 = min(len, available items) */ + mov x12, #0 +.LTcmallocSlab_PopBatch_FixedShift_loop: + sub x9, x9, #1 + ldr x10, [x8, x9, LSL #3] + str x10, [x2, x12, LSL #3] + add x12, x12, 1 + cmp x12, x11 + bne .LTcmallocSlab_PopBatch_FixedShift_loop + strh w9, [x4] +.LTcmallocSlab_PopBatch_FixedShift_region1: +.LTcmallocSlab_PopBatch_FixedShift_commit: + mov x0, x12 + ret +.LTcmallocSlab_PopBatch_FixedShift_region2: + mov x0, #0 + ret +.LTcmallocSlab_PopBatch_FixedShift_region3: + .cfi_endproc +ENCODE_SIZE(TcmallocSlab_PopBatch_FixedShift) +DEFINE_UPSTREAM_CS(TcmallocSlab_PopBatch_FixedShift) + + .globl TcmallocSlab_Push + .type TcmallocSlab_Push, @function +TcmallocSlab_Push: +.LTcmallocSlab_Push_entry: + .cfi_startproc + // Arguments use: + // * x0: (Argument: Slabs*) cpu_0_slab_ptr + // * x1: (Argument: uintptr_t) cl + // * x2: (Argument: uintptr_t) p + // * w3: (Argument: size_t) shift + // * x4: (Argument: uintptr_t) f + // Return value: current CPU + // Available x5-x15 + + START_RSEQ(TcmallocSlab_Push) + FETCH_CPU(w8) + lsl x9, x8, x3 + add x9, x0, x9 + add x10, x9, x1, LSL #3 + ldrh w12, [x10] /* current */ + ldrh w11, [x10, #6] /* end */ + cmp w11, w12 + ble .LTcmallocSlab_Push_no_capacity + str x2, [x9, x12, LSL #3] + add w12, w12, #1 + strh w12, [x10] +.LTcmallocSlab_Push_commit: + mov x0, x8 + ret +.LTcmallocSlab_Push_no_capacity: + mov x0, x8 + br x4 +.LTcmallocSlab_Push_region3: + .cfi_endproc +ENCODE_SIZE(TcmallocSlab_Push) +DEFINE_UPSTREAM_CS(TcmallocSlab_Push) + + + .globl 
TcmallocSlab_Push_FixedShift + .type TcmallocSlab_Push_FixedShift, @function +TcmallocSlab_Push_FixedShift: +.LTcmallocSlab_Push_FixedShift_entry: + .cfi_startproc + // Arguments use: + // * x0: (Argument: Slabs*) cpu_0_slab_ptr + // * x1: (Argument: uintptr_t) cl + // * x2: (Argument: uintptr_t) p + // * x3: (Argument: uintptr_t) f + // Return value: current CPU + // Available x4-x15 + + START_RSEQ(TcmallocSlab_Push_FixedShift) + FETCH_CPU(w8) + lsl x9, x8, #PERCPU_TCMALLOC_FIXED_SLAB_SHIFT + add x9, x0, x9 + add x10, x9, x1, LSL #3 + ldrh w12, [x10] /* current */ + ldrh w11, [x10, #6] /* end */ + cmp w11, w12 + ble .LTcmallocSlab_Push_FixedShift_no_capacity + str x2, [x9, x12, LSL #3] + add w12, w12, #1 + strh w12, [x10] +.LTcmallocSlab_Push_FixedShift_commit: + mov x0, x8 + ret +.LTcmallocSlab_Push_FixedShift_no_capacity: + mov x0, x8 + br x3 +.LTcmallocSlab_Push_FixedShift_region3: + .cfi_endproc +ENCODE_SIZE(TcmallocSlab_Push_FixedShift) +DEFINE_UPSTREAM_CS(TcmallocSlab_Push_FixedShift) + + .globl TcmallocSlab_Pop_FixedShift + .type TcmallocSlab_Pop_FixedShift, @function +TcmallocSlab_Pop_FixedShift: +.LTcmallocSlab_Pop_FixedShift_entry: + .cfi_startproc + // Arguments use: + // * x0: (Argument: Slabs*) cpu_0_slab_ptr + // * x1: (Argument: uintptr_t) cl + // * x2: (Argument: uintptr_t) f + // Return value: current CPU + // Available x3-x15 + + START_RSEQ(TcmallocSlab_Pop_FixedShift) + FETCH_CPU(w8) //x8 = CPU + lsl x9, x8, #PERCPU_TCMALLOC_FIXED_SLAB_SHIFT //x9 = CPU shifted + add x9, x0, x9 //x9 = start of CPU region + add x10, x9, x1, LSL #3 //x10 = start of slab header + ldrh w12, [x10] //x12 = current index + ldrh w11, [x10, #4] //x11 = begin index + cmp w11, w12 // if begin >= current + bge .LTcmallocSlab_Pop_FixedShift_no_items + sub w12, w12, #1 //x12 = current-- + ldr x3, [x9, x12, LSL #3] //x3 = [start + index * 8] + strh w12, [x10] // store new current index +.LTcmallocSlab_Pop_FixedShift_commit: + mov x0, x3 // return popped item + ret +.LTcmallocSlab_Pop_FixedShift_no_items: + mov x0, x8 // call overflow handler with CPU ID + br x2 +.LTcmallocSlab_Pop_FixedShift_region3: + .cfi_endproc +ENCODE_SIZE(TcmallocSlab_Pop_FixedShift) +DEFINE_UPSTREAM_CS(TcmallocSlab_Pop_FixedShift) + + .globl TcmallocSlab_Pop + .type TcmallocSlab_Pop, @function +TcmallocSlab_Pop: +.LTcmallocSlab_Pop_entry: + .cfi_startproc + // Arguments use: + // * x0: (Argument: Slabs*) cpu_0_slab_ptr + // * x1: (Argument: uintptr_t) cl + // * x2: (Argument: uintptr_t) f + // * w3: (Argument: size_t) shift + // Return value: Value + // Available x4-x15 + + START_RSEQ(TcmallocSlab_Pop) + FETCH_CPU(w8) //x8 = CPU ID + lsl x9, x8, x3 //x9 = CPU shifted by (x3) + add x9, x0, x9 //x9 = start of this CPU region + add x10, x9, x1, LSL #3 //x10 = slab header addr + ldrh w12, [x10] //x12 = current index + ldrh w11, [x10, #4] //x11 = begin index + cmp w11, w12 //if begin >= current + bge .LTcmallocSlab_Pop_no_items + sub w12, w12, #1 //x12 = current-- + ldr x4, [x9, x12, LSL #3] //x4 = [start + index * 8] + strh w12, [x10] //update current index +.LTcmallocSlab_Pop_commit: + mov x0, x4 // return popped item + ret +.LTcmallocSlab_Pop_no_items: + mov x0, x8 // call overflow handler with CPU ID + br x2 +.LTcmallocSlab_Pop_region3: + .cfi_endproc +ENCODE_SIZE(TcmallocSlab_Pop) +DEFINE_UPSTREAM_CS(TcmallocSlab_Pop) + +.section .note.GNU-stack,"",@progbits diff --git a/tcmalloc/internal/percpu_rseq_asm.S b/tcmalloc/internal/percpu_rseq_asm.S index 2de53725d..24ce87000 100644 --- a/tcmalloc/internal/percpu_rseq_asm.S +++ 
b/tcmalloc/internal/percpu_rseq_asm.S @@ -21,6 +21,8 @@ #include "tcmalloc/internal/percpu_rseq_x86_64.S" #elif defined(__ppc__) #include "tcmalloc/internal/percpu_rseq_ppc.S" +#elif defined(__aarch64__) +#include "tcmalloc/internal/percpu_rseq_aarch64.S" #else #error "RSEQ support expected, but not found." #endif
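
Note for reviewers unfamiliar with the rseq ABI touched above: before any of these sequences can run, each thread registers a rseq area with the kernel via the rseq syscall (number 293 on AArch64, per the linux_syscall_support.h change), passing the same signature that SIGN_ABORT() plants immediately before every abort trampoline. The sketch below is illustrative only -- register_rseq and rseq_area are not tcmalloc names -- but it shows the registration call and the two offsets the assembly relies on: cpu_id at offset 4 (read by FETCH_CPU) and rseq_cs at offset 8 (written by START_RSEQ).

    #include <stdint.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef __NR_rseq
    #define __NR_rseq 293               /* AArch64 value added above */
    #endif

    /* Mirrors the kernel's struct rseq.  FETCH_CPU() reads cpu_id at offset 4;
     * START_RSEQ() stores the critical-section descriptor pointer at offset 8. */
    struct kernel_rseq {
      uint32_t cpu_id_start;            /* offset 0  */
      uint32_t cpu_id;                  /* offset 4  */
      uint64_t rseq_cs;                 /* offset 8  */
      uint32_t flags;                   /* offset 16 */
    } __attribute__((aligned(32)));

    static __thread struct kernel_rseq rseq_area;

    /* Hypothetical helper: registers the calling thread for rseq.  The final
     * argument must equal PERCPU_RSEQ_SIGNATURE (0xd428bc00, which decodes to
     * an AArch64 BRK instruction), i.e. the word SIGN_ABORT() emits in front
     * of every *_trampoline; otherwise the kernel rejects the registration. */
    static int register_rseq(void) {
      return (int)syscall(__NR_rseq, &rseq_area, sizeof(rseq_area),
                          /*flags=*/0, /*sig=*/0xd428bc00u);
    }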
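
DEFINE_UPSTREAM_CS() hand-assembles one 32-byte critical-section descriptor per function in the __rseq_cs section, a pointer to it in __rseq_cs_ptr_array (to keep section GC honest), and the signed trampoline. Written as a C struct for reference -- field names follow the kernel's struct rseq_cs, and the size matches the kernel_rseq_cs that linux_syscall_support.h already static_asserts to be 32 bytes -- each record is:

    #include <stdint.h>

    struct kernel_rseq_cs {
      uint32_t version;             /* .long PERCPU_RSEQ_VERSION            */
      uint32_t flags;               /* .long PERCPU_RSEQ_FLAGS              */
      uint64_t start_ip;            /* .quad .L<label>_start                */
      uint64_t post_commit_offset;  /* .quad .L<label>_commit - _start      */
      uint64_t abort_ip;            /* .quad <label>_trampoline             */
    } __attribute__((aligned(32)));

START_RSEQ() stores the address of this record into the thread's rseq area at offset 8. If the thread is preempted or migrated while its PC lies in [start_ip, start_ip + post_commit_offset), the kernel verifies that the 4 bytes preceding abort_ip match the registered signature and resumes execution at the trampoline, which branches back to .L<label>_abort to re-run the whole sequence.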
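
The halfword loads in the push/pop bodies encode the per-size-class header layout: "current" at offset 0, "begin" at offset 4 and "end" at offset 6 within an 8-byte header located at slabs + (cpu << shift) + cl * 8. Below is a hedged C rendering of the fixed-shift push path. The field name end_copy, the overflow-handler signature and the shift value of 18 (256 KiB per CPU, matching the "multiply cpu by 256k" comment) are assumptions for illustration, and plain C of course cannot provide the restart-on-migration guarantee that makes the assembly correct.

    #include <stddef.h>
    #include <stdint.h>

    #define FIXED_SLAB_SHIFT 18      /* assumption: 256 KiB per-CPU region */

    struct SlabHeader {              /* 8 bytes, one per (cpu, size class) */
      uint16_t current;              /* offset 0: index of the next free slot */
      uint16_t end_copy;             /* offset 2: assumption -- untouched here */
      uint16_t begin;                /* offset 4: lowest valid index (pop bound) */
      uint16_t end;                  /* offset 6: one past highest slot (push bound) */
    };

    typedef int (*OverflowHandler)(int cpu, size_t cl, void *item);  /* assumed shape */

    /* Illustrative-only equivalent of TcmallocSlab_Push_FixedShift.  The real
     * code must run between .L*_start and the strh that commits `current`, so
     * the kernel can restart it if the thread migrates after FETCH_CPU. */
    static int SlabPush(void *slabs, size_t cl, void *item, OverflowHandler f,
                        uint32_t cpu /* from __rseq_abi.cpu_id */) {
      char *region = (char *)slabs + ((size_t)cpu << FIXED_SLAB_SHIFT);
      struct SlabHeader *hdr = (struct SlabHeader *)(region + cl * sizeof(void *));
      if (hdr->end <= hdr->current)          /* cmp w11, w12; ble no_capacity */
        return f((int)cpu, cl, item);        /* tail-call overflow path, x0 = cpu */
      ((void **)region)[hdr->current] = item;  /* str x2, [x9, x12, LSL #3] */
      hdr->current++;                          /* strh w12, [x10] -- the commit */
      return (int)cpu;
    }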