Skip to content

Commit 9e6fa39

Browse files
authored
[BOLT][AArch64][instr] Consider targeting ARM64 CPUs without LSE support (#158738)
`stadd` is only available on recent arm64 CPUs that have LSE support (like Cortex-A73 and Cortex-A75) and is not available on older arm64 CPUs (like Cortex-A53 and Cortex-A55). Devices can have a mixture of these two kinds of CPUs, so we need to provide an option for BOLT to generate an instrumentation sequence that emulates what `stadd` would do. The implementation puts the counter increment into an injected helper function, so we don't need to update the CFG of the function being instrumented, and the instrumentation-induced binary size increase is smaller.
1 parent 185ae5c commit 9e6fa39

File tree

6 files changed

+197
-20
lines changed

6 files changed

+197
-20
lines changed

bolt/include/bolt/Core/MCPlusBuilder.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ class raw_ostream;
5151

5252
namespace bolt {
5353
class BinaryBasicBlock;
54+
class BinaryContext;
5455
class BinaryFunction;
5556

5657
/// Different types of indirect branches encountered during disassembly.
@@ -530,10 +531,15 @@ class MCPlusBuilder {
530531
return 0;
531532
}
532533

534+
/// Create a helper function to increment counter for Instrumentation
535+
virtual void createInstrCounterIncrFunc(BinaryContext &BC) {
536+
llvm_unreachable("not implemented");
537+
}
538+
533539
/// Create an instruction sequence that increments the contents of Target by 1 for instrumentation
534-
virtual InstructionListType
535-
createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
536-
unsigned CodePointerSize) const {
540+
virtual InstructionListType createInstrIncMemory(const MCSymbol *Target,
541+
MCContext *Ctx, bool IsLeaf,
542+
unsigned CodePointerSize) {
537543
llvm_unreachable("not implemented");
538544
return InstructionListType();
539545
}

bolt/lib/Passes/Instrumentation.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -753,6 +753,8 @@ void Instrumentation::createAuxiliaryFunctions(BinaryContext &BC) {
753753
createSimpleFunction("__bolt_fini_trampoline",
754754
BC.MIB->createReturnInstructionList(BC.Ctx.get()));
755755
}
756+
if (BC.isAArch64())
757+
BC.MIB->createInstrCounterIncrFunc(BC);
756758
}
757759
}
758760

bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp

Lines changed: 130 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "llvm/MC/MCInstrInfo.h"
2727
#include "llvm/MC/MCRegister.h"
2828
#include "llvm/MC/MCRegisterInfo.h"
29+
#include "llvm/Support/CommandLine.h"
2930
#include "llvm/Support/DataExtractor.h"
3031
#include "llvm/Support/Debug.h"
3132
#include "llvm/Support/ErrorHandling.h"
@@ -35,6 +36,15 @@
3536
using namespace llvm;
3637
using namespace bolt;
3738

39+
namespace opts {
40+
extern cl::OptionCategory BoltInstrCategory;
41+
static cl::opt<bool> NoLSEAtomics(
42+
"no-lse-atomics",
43+
cl::desc("generate instrumentation code sequence without using LSE atomic "
44+
"instruction"),
45+
cl::init(false), cl::Optional, cl::cat(BoltInstrCategory));
46+
} // namespace opts
47+
3848
namespace {
3949

4050
static void getSystemFlag(MCInst &Inst, MCPhysReg RegName) {
@@ -106,7 +116,7 @@ static void storeReg(MCInst &Inst, MCPhysReg From, MCPhysReg To) {
106116
}
107117

108118
static void atomicAdd(MCInst &Inst, MCPhysReg RegTo, MCPhysReg RegCnt) {
109-
// NOTE: Supports only ARM with LSE extension
119+
assert(!opts::NoLSEAtomics && "Supports only ARM with LSE extension");
110120
Inst.setOpcode(AArch64::LDADDX);
111121
Inst.clear();
112122
Inst.addOperand(MCOperand::createReg(AArch64::XZR));
@@ -135,6 +145,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
135145
public:
136146
using MCPlusBuilder::MCPlusBuilder;
137147

148+
BinaryFunction *InstrCounterIncrFunc{nullptr};
149+
138150
std::unique_ptr<MCSymbolizer>
139151
createTargetSymbolizer(BinaryFunction &Function,
140152
bool CreateNewSymbols) const override {
@@ -2513,22 +2525,129 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
25132525
return Insts;
25142526
}
25152527

2516-
InstructionListType
2517-
createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
2518-
unsigned CodePointerSize) const override {
2528+
// Instrumentation code sequence using LSE atomic instruction has a total of
2529+
// 6 instructions:
2530+
//
2531+
// stp x0, x1, [sp, #-0x10]!
2532+
// adrp x0, page_address(counter)
2533+
// add x0, x0, page_offset(counter)
2534+
// mov x1, #0x1
2535+
// stadd x1, [x0]
2536+
// ldp x0, x1, [sp], #0x10
2537+
//
2538+
// Instrumentation code sequence without LSE atomic instructions has 8
2539+
// instructions at the instrumentation site and 6 in the helper (which also pops the x1/x2 pair pushed at the site):
2540+
//
2541+
// stp x0, x30, [sp, #-0x10]!
2542+
// stp x1, x2, [sp, #-0x10]!
2543+
// adrp x0, page_address(counter)
2544+
// add x0, x0, page_offset(counter)
2545+
// adrp x1, page_address(helper)
2546+
// add x1, x1, page_offset(helper)
2547+
// blr x1
2548+
// ldp x0, x30, [sp], #0x10
2549+
//
2550+
// <helper>:
2551+
// ldaxr x1, [x0]
2552+
// add x1, x1, #0x1
2553+
// stlxr w2, x1, [x0]
2554+
// cbnz w2, <helper>
2555+
// ldp x1, x2, [sp], #0x10
2556+
// ret
2557+
2558+
void createInstrCounterIncrFunc(BinaryContext &BC) override {
2559+
assert(InstrCounterIncrFunc == nullptr &&
2560+
"helper function of counter increment for instrumentation "
2561+
"has already been created");
2562+
2563+
if (!opts::NoLSEAtomics)
2564+
return;
2565+
2566+
MCContext *Ctx = BC.Ctx.get();
2567+
InstrCounterIncrFunc = BC.createInjectedBinaryFunction(
2568+
"__bolt_instr_counter_incr", /*IsSimple*/ false);
2569+
std::vector<std::unique_ptr<BinaryBasicBlock>> BBs;
2570+
2571+
BBs.emplace_back(InstrCounterIncrFunc->createBasicBlock());
2572+
InstructionListType Instrs(4);
2573+
Instrs[0].setOpcode(AArch64::LDAXRX);
2574+
Instrs[0].clear();
2575+
Instrs[0].addOperand(MCOperand::createReg(AArch64::X1));
2576+
Instrs[0].addOperand(MCOperand::createReg(AArch64::X0));
2577+
Instrs[1].setOpcode(AArch64::ADDXri);
2578+
Instrs[1].clear();
2579+
Instrs[1].addOperand(MCOperand::createReg(AArch64::X1));
2580+
Instrs[1].addOperand(MCOperand::createReg(AArch64::X1));
2581+
Instrs[1].addOperand(MCOperand::createImm(1));
2582+
Instrs[1].addOperand(MCOperand::createImm(0));
2583+
Instrs[2].setOpcode(AArch64::STLXRX);
2584+
Instrs[2].clear();
2585+
Instrs[2].addOperand(MCOperand::createReg(AArch64::W2));
2586+
Instrs[2].addOperand(MCOperand::createReg(AArch64::X1));
2587+
Instrs[2].addOperand(MCOperand::createReg(AArch64::X0));
2588+
Instrs[3].setOpcode(AArch64::CBNZW);
2589+
Instrs[3].clear();
2590+
Instrs[3].addOperand(MCOperand::createReg(AArch64::W2));
2591+
Instrs[3].addOperand(MCOperand::createExpr(
2592+
MCSymbolRefExpr::create(BBs.back()->getLabel(), *Ctx)));
2593+
BBs.back()->addInstructions(Instrs.begin(), Instrs.end());
2594+
BBs.back()->setCFIState(0);
2595+
2596+
BBs.emplace_back(InstrCounterIncrFunc->createBasicBlock());
2597+
InstructionListType InstrsEpilog(2);
2598+
createPopRegisters(InstrsEpilog[0], AArch64::X1, AArch64::X2);
2599+
createReturn(InstrsEpilog[1]);
2600+
BBs.back()->addInstructions(InstrsEpilog.begin(), InstrsEpilog.end());
2601+
BBs.back()->setCFIState(0);
2602+
2603+
BBs[0]->addSuccessor(BBs[0].get());
2604+
BBs[0]->addSuccessor(BBs[1].get());
2605+
2606+
InstrCounterIncrFunc->insertBasicBlocks(nullptr, std::move(BBs),
2607+
/*UpdateLayout*/ true,
2608+
/*UpdateCFIState*/ false);
2609+
InstrCounterIncrFunc->updateState(BinaryFunction::State::CFG_Finalized);
2610+
2611+
LLVM_DEBUG({
2612+
dbgs() << "BOLT-DEBUG: instrumentation counter increment helper:\n";
2613+
InstrCounterIncrFunc->dump();
2614+
});
2615+
}
2616+
2617+
InstructionListType createInstrIncMemory(const MCSymbol *Target,
2618+
MCContext *Ctx, bool IsLeaf,
2619+
unsigned CodePointerSize) override {
25192620
unsigned int I = 0;
2520-
InstructionListType Instrs(6);
2621+
InstructionListType Instrs(opts::NoLSEAtomics ? 8 : 6);
2622+
2623+
if (opts::NoLSEAtomics) {
2624+
createPushRegisters(Instrs[I++], AArch64::X0, AArch64::LR);
2625+
createPushRegisters(Instrs[I++], AArch64::X1, AArch64::X2);
2626+
} else {
2627+
createPushRegisters(Instrs[I++], AArch64::X0, AArch64::X1);
2628+
}
25212629

2522-
createPushRegisters(Instrs[I++], AArch64::X0, AArch64::X1);
25232630
InstructionListType Addr = materializeAddress(Target, Ctx, AArch64::X0);
25242631
assert(Addr.size() == 2 && "Invalid Addr size");
25252632
std::copy(Addr.begin(), Addr.end(), Instrs.begin() + I);
25262633
I += Addr.size();
2527-
InstructionListType Insts = createIncMemory(AArch64::X0, AArch64::X1);
2528-
assert(Insts.size() == 2 && "Invalid Insts size");
2529-
std::copy(Insts.begin(), Insts.end(), Instrs.begin() + I);
2530-
I += Insts.size();
2531-
createPopRegisters(Instrs[I++], AArch64::X0, AArch64::X1);
2634+
2635+
if (opts::NoLSEAtomics) {
2636+
const MCSymbol *Helper = InstrCounterIncrFunc->getSymbol();
2637+
InstructionListType HelperAddr =
2638+
materializeAddress(Helper, Ctx, AArch64::X1);
2639+
assert(HelperAddr.size() == 2 && "Invalid HelperAddr size");
2640+
std::copy(HelperAddr.begin(), HelperAddr.end(), Instrs.begin() + I);
2641+
I += HelperAddr.size();
2642+
createIndirectCallInst(Instrs[I++], /*IsTailCall*/ false, AArch64::X1);
2643+
} else {
2644+
InstructionListType Insts = createIncMemory(AArch64::X0, AArch64::X1);
2645+
assert(Insts.size() == 2 && "Invalid Insts size");
2646+
std::copy(Insts.begin(), Insts.end(), Instrs.begin() + I);
2647+
I += Insts.size();
2648+
}
2649+
createPopRegisters(Instrs[I++], AArch64::X0,
2650+
opts::NoLSEAtomics ? AArch64::LR : AArch64::X1);
25322651
return Instrs;
25332652
}
25342653

bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -626,9 +626,9 @@ class RISCVMCPlusBuilder : public MCPlusBuilder {
626626
return Insts;
627627
}
628628

629-
InstructionListType
630-
createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
631-
unsigned CodePointerSize) const override {
629+
InstructionListType createInstrIncMemory(const MCSymbol *Target,
630+
MCContext *Ctx, bool IsLeaf,
631+
unsigned CodePointerSize) override {
632632
// We need 2 scratch registers: one for the target address (x10), and one
633633
// for the increment value (x11).
634634
// addi sp, sp, -16

bolt/lib/Target/X86/X86MCPlusBuilder.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3053,9 +3053,9 @@ class X86MCPlusBuilder : public MCPlusBuilder {
30533053
Inst.clear();
30543054
}
30553055

3056-
InstructionListType
3057-
createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
3058-
unsigned CodePointerSize) const override {
3056+
InstructionListType createInstrIncMemory(const MCSymbol *Target,
3057+
MCContext *Ctx, bool IsLeaf,
3058+
unsigned CodePointerSize) override {
30593059
InstructionListType Instrs(IsLeaf ? 13 : 11);
30603060
unsigned int I = 0;
30613061

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# This test validates the instrumentation code sequences generated with
2+
# and without `--no-lse-atomics`.
3+
4+
# REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}}
5+
6+
# RUN: %clang %cflags -pie %s -o %t.so -Wl,-q -Wl,--init=_foo -Wl,--fini=_foo
7+
8+
.text
9+
.global _foo
10+
.type _foo, %function
11+
_foo:
12+
ret
13+
14+
.global _start
15+
.type _start, %function
16+
_start:
17+
ret
18+
19+
# Dummy relocation to force relocation mode
20+
.reloc 0, R_AARCH64_NONE
21+
22+
# RUN: llvm-bolt %t.so -o %t.instr.so --instrument
23+
# RUN: llvm-objdump -d %t.instr.so | FileCheck %s --check-prefix=INLINE
24+
# INLINE: {{.*}} <_foo>:
25+
# INLINE-NEXT: {{.*}} stp x0, x1, [sp, #-0x10]!
26+
# INLINE-NEXT: {{.*}} adrp x0, 0x{{[0-9a-f]*}} {{.*}}
27+
# INLINE-NEXT: {{.*}} add x0, x0, #0x{{[0-9a-f]*}}
28+
# INLINE-NEXT: {{.*}} mov x1, #0x1
29+
# INLINE-NEXT: {{.*}} stadd x1, [x0]
30+
# INLINE-NEXT: {{.*}} ldp x0, x1, [sp], #0x10
31+
32+
# RUN: llvm-bolt %t.so -o %t.instr.no_lse.so --instrument \
33+
# RUN: --no-lse-atomics
34+
# RUN: llvm-objdump -d %t.instr.no_lse.so | FileCheck %s --check-prefix=NOLSE
35+
# NOLSE: {{.*}} <_foo>:
36+
# NOLSE-NEXT: {{.*}} stp x0, x30, [sp, #-0x10]!
37+
# NOLSE-NEXT: {{.*}} stp x1, x2, [sp, #-0x10]!
38+
# NOLSE-NEXT: {{.*}} adrp x0, 0x{{[0-9a-f]*}} {{.*}}
39+
# NOLSE-NEXT: {{.*}} add x0, x0, #0x{{[0-9a-f]*}}
40+
# NOLSE-NEXT: {{.*}} adrp x1, 0x[[PAGEBASE:[0-9a-f]*]]000 {{.*}}
41+
# NOLSE-NEXT: {{.*}} add x1, x1, #0x[[PAGEOFF:[0-9a-f]*]]
42+
# NOLSE-NEXT: {{.*}} blr x1
43+
# NOLSE-NEXT: {{.*}} ldp x0, x30, [sp], #0x10
44+
# NOLSE: {{[0]*}}[[PAGEBASE]][[PAGEOFF]] <__bolt_instr_counter_incr>:
45+
# NOLSE-NEXT: {{.*}} ldaxr x1, [x0]
46+
# NOLSE-NEXT: {{.*}} add x1, x1, #0x1
47+
# NOLSE-NEXT: {{.*}} stlxr w2, x1, [x0]
48+
# NOLSE-NEXT: {{.*}} cbnz w2, 0x{{[0-9a-f]*}} <__bolt_instr_counter_incr>
49+
# NOLSE-NEXT: {{.*}} ldp x1, x2, [sp], #0x10
50+
# NOLSE-NEXT: {{.*}} ret

0 commit comments

Comments
 (0)