From 1ff2eb1d866a7b50e1d98af2737555e9d8b7e223 Mon Sep 17 00:00:00 2001 From: Dougall Johnson Date: Thu, 5 Oct 2023 18:53:40 +1100 Subject: [PATCH] [Aarch64] Materialize immediates with 64-bit ORR + EOR if shorter A number of useful constants can be encoded with a 64-bit ORR followed by a 64-bit EOR, including all remaining repeated byte patterns, some useful repeated 16-bit patterns, and some irregular masks. This patch prioritizes that encoding over three or four instruction encodings. Encoding with MOV + MOVK or ORR + MOVK is still preferred for fast literal generation and readability respectively. The method devises three candidate values, and checks if both Candidate and (Imm ^ Candidate) are valid logical immediates. If so, Imm is materialized with: ``` ORR Xd, XZR, #(Imm ^ Candidate) EOR Xd, Xd, #(Candidate) ``` The method has been exhaustively tested to ensure it can solve all possible values (excluding 0, ~0, and plain logical immediates, which are handled earlier). --- llvm/lib/Target/AArch64/AArch64ExpandImm.cpp | 103 +++++++++++++++ .../AArch64/AArch64ExpandPseudoInsts.cpp | 1 + llvm/test/CodeGen/AArch64/arm64-movi.ll | 117 ++++++++++++++++++ 3 files changed, 221 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp index 731972a039ba44..a7d72b59b1d5a6 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp @@ -362,6 +362,105 @@ static bool tryAndOfLogicalImmediates(uint64_t UImm, return false; } +// Check whether the constant can be represented by exclusive-or of two 64-bit +// logical immediates. If so, materialize it with an ORR instruction followed +// by an EOR instruction. +// +// This encoding allows all remaining repeated byte patterns, and many repeated +// 16-bit values, to be encoded without needing four instructions. 
It can also
+// represent some irregular bitmasks (although those would mostly only need
+// three instructions otherwise).
+static bool tryEorOfLogicalImmediates(uint64_t Imm,
+                                      SmallVectorImpl<ImmInsnModel> &Insn) {
+  // Determine the larger repetition size of the two possible logical
+  // immediates, by finding the repetition size of Imm.
+  unsigned BigSize = 64;
+
+  do {
+    BigSize /= 2;
+    uint64_t Mask = (1ULL << BigSize) - 1;
+
+    if ((Imm & Mask) != ((Imm >> BigSize) & Mask)) {
+      BigSize *= 2;
+      break;
+    }
+  } while (BigSize > 2);
+
+  uint64_t BigMask = ((uint64_t)-1LL) >> (64 - BigSize);
+
+  // Find the last bit of each run of ones, circularly. For runs which wrap
+  // around from bit 0 to bit 63, this is the bit before the most-significant
+  // zero, otherwise it is the least-significant bit in the run of ones.
+  uint64_t RunStarts = Imm & ~rotl(Imm, 1);
+
+  // Find the smaller repetition size of the two possible logical immediates by
+  // counting the number of runs of one-bits within the BigSize-bit value. Both
+  // sizes may be the same. The EOR may add one or subtract one from the
+  // power-of-two count that can be represented by a logical immediate, or it
+  // may be left unchanged.
+  int RunsPerBigChunk = popcount(RunStarts & BigMask);
+
+  static const int8_t BigToSmallSizeTable[32] = {
+      -1, -1, 0,  1,  2,  2,  -1, 3,  3,  3,  -1, -1, -1, -1, -1, 4,
+      4,  4,  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5,
+  };
+
+  int BigToSmallShift = BigToSmallSizeTable[RunsPerBigChunk];
+
+  // Early-exit if the big chunk couldn't be a power-of-two number of runs
+  // EORed with another single run.
+  if (BigToSmallShift == -1)
+    return false;
+
+  unsigned SmallSize = BigSize >> BigToSmallShift;
+
+  // 64-bit values with a bit set every (1 << index) bits.
+ static const uint64_t RepeatedOnesTable[] = { + 0xffffffffffffffff, 0x5555555555555555, 0x1111111111111111, + 0x0101010101010101, 0x0001000100010001, 0x0000000100000001, + 0x0000000000000001, + }; + + // This RepeatedOnesTable lookup is a faster implementation of the division + // 0xffffffffffffffff / ((1 << SmallSize) - 1), and can be thought of as + // dividing the 64-bit value into fields of width SmallSize, and placing a + // one in the least significant bit of each field. + uint64_t SmallOnes = RepeatedOnesTable[countr_zero(SmallSize)]; + + // Now we try to find the number of ones in each of the smaller repetitions, + // by looking at runs of ones in Imm. This can take three attempts, as the + // EOR may have changed the length of the first two runs we find. + + // Rotate a run of ones so we can count the number of trailing set bits. + int Rotation = countr_zero(RunStarts); + uint64_t RotatedImm = rotr(Imm, Rotation); + for (int Attempt = 0; Attempt < 3; ++Attempt) { + unsigned RunLength = countr_one(RotatedImm); + + // Construct candidate values BigImm and SmallImm, such that if these two + // values are encodable, we have a solution. 
(SmallImm is constructed to be + // encodable, but this isn't guaranteed when RunLength >= SmallSize) + uint64_t SmallImm = + rotl((SmallOnes << RunLength) - SmallOnes, Rotation); + uint64_t BigImm = Imm ^ SmallImm; + + uint64_t BigEncoding = 0; + uint64_t SmallEncoding = 0; + if (AArch64_AM::processLogicalImmediate(BigImm, 64, BigEncoding) && + AArch64_AM::processLogicalImmediate(SmallImm, 64, SmallEncoding)) { + Insn.push_back({AArch64::ORRXri, 0, SmallEncoding}); + Insn.push_back({AArch64::EORXri, 1, BigEncoding}); + return true; + } + + // Rotate to the next run of ones + Rotation += countr_zero(rotr(RunStarts, Rotation) & ~1); + RotatedImm = rotr(Imm, Rotation); + } + + return false; +} + /// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to a /// MOVZ or MOVN of width BitSize followed by up to 3 MOVK instructions. static inline void expandMOVImmSimple(uint64_t Imm, unsigned BitSize, @@ -503,6 +602,10 @@ void AArch64_IMM::expandMOVImm(uint64_t Imm, unsigned BitSize, if (tryAndOfLogicalImmediates(Imm, Insn)) return; + // Attempt to use a sequence of ORR-immediate followed by EOR-immediate. + if (tryEorOfLogicalImmediates(UImm, Insn)) + return; + // FIXME: Add more two-instruction sequences. // Three instruction sequences. 
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index cc61373d51d718..38b5eeaf4057a7 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -171,6 +171,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, } break; case AArch64::ANDXri: + case AArch64::EORXri: if (I->Op1 == 0) { MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode)) .add(MI.getOperand(0)) diff --git a/llvm/test/CodeGen/AArch64/arm64-movi.ll b/llvm/test/CodeGen/AArch64/arm64-movi.ll index 2ec58dbee02325..8ec98b74429718 100644 --- a/llvm/test/CodeGen/AArch64/arm64-movi.ll +++ b/llvm/test/CodeGen/AArch64/arm64-movi.ll @@ -432,3 +432,120 @@ define i64 @orr_64_orr_8() nounwind { ; CHECK-NEXT: ret ret i64 -5764607889538110806 } + +define i64 @orr_2_eor_16() nounwind { +; CHECK-LABEL: orr_2_eor_16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #6148914691236517205 +; CHECK-NEXT: eor x0, x0, #0x3000300030003000 +; CHECK-NEXT: ret + ret i64 7301853788297848149 +} + +define i64 @orr_2_eor_32() nounwind { +; CHECK-LABEL: orr_2_eor_32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #6148914691236517205 +; CHECK-NEXT: eor x0, x0, #0x1fffc0001fffc0 +; CHECK-NEXT: ret + ret i64 6145912199858268821 +} + +define i64 @orr_2_eor_64() nounwind { +; CHECK-LABEL: orr_2_eor_64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #6148914691236517205 +; CHECK-NEXT: eor x0, x0, #0x1fffffffffc00 +; CHECK-NEXT: ret + ret i64 6148727041252043093 +} + +define i64 @orr_4_eor_8() nounwind { +; CHECK-LABEL: orr_4_eor_8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #2459565876494606882 +; CHECK-NEXT: eor x0, x0, #0x8f8f8f8f8f8f8f8f +; CHECK-NEXT: ret + ret i64 12514849900987264429 +} + +define i64 @orr_4_eor_16() nounwind { +; CHECK-LABEL: orr_4_eor_16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #4919131752989213764 +; CHECK-NEXT: eor x0, x0, #0xf00ff00ff00ff00f +; 
CHECK-NEXT: ret + ret i64 12991675787320734795 +} + +define i64 @orr_4_eor_32() nounwind { +; CHECK-LABEL: orr_4_eor_32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #4919131752989213764 +; CHECK-NEXT: eor x0, x0, #0x1ff800001ff80000 +; CHECK-NEXT: ret + ret i64 6610233413460575300 +} + +define i64 @orr_4_eor_64() nounwind { +; CHECK-LABEL: orr_4_eor_64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #1229782938247303441 +; CHECK-NEXT: eor x0, x0, #0xfff80000000 +; CHECK-NEXT: ret + ret i64 1229798183233720593 +} + +define i64 @orr_8_eor_16() nounwind { +; CHECK-LABEL: orr_8_eor_16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #3472328296227680304 +; CHECK-NEXT: eor x0, x0, #0x1f801f801f801f80 +; CHECK-NEXT: ret + ret i64 3436298949444513712 +} + +define i64 @orr_8_eor_32() nounwind { +; CHECK-LABEL: orr_8_eor_32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #1157442765409226768 +; CHECK-NEXT: eor x0, x0, #0xffff8001ffff8001 +; CHECK-NEXT: ret + ret i64 17289195901212921873 +} + +define i64 @orr_8_eor_64() nounwind { +; CHECK-LABEL: orr_8_eor_64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #3472328296227680304 +; CHECK-NEXT: eor x0, x0, #0x3ffffffff00000 +; CHECK-NEXT: ret + ret i64 3463215129921859632 +} + +define i64 @orr_16_eor_32() nounwind { +; CHECK-LABEL: orr_16_eor_32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #1143931760365539296 +; CHECK-NEXT: eor x0, x0, #0xffff0001ffff0001 +; CHECK-NEXT: ret + ret i64 17302565756451360737 +} + +define i64 @orr_16_eor_64() nounwind { +; CHECK-LABEL: orr_16_eor_64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #9214505439794855904 +; CHECK-NEXT: eor x0, x0, #0xfe000 +; CHECK-NEXT: ret + ret i64 9214505439795847136 +} + +define i64 @orr_32_eor_64() nounwind { +; CHECK-LABEL: orr_32_eor_64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #1030792151280 +; CHECK-NEXT: eor x0, x0, #0xffff8000003fffff +; CHECK-NEXT: ret + ret i64 18446604367017541391 +}