Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Aarch64] Materialize immediates with 64-bit ORR + EOR if shorter #68287

Merged
merged 1 commit into from
Oct 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,105 @@ static bool tryAndOfLogicalImmediates(uint64_t UImm,
return false;
}

// Try to materialize Imm as the exclusive-or of two 64-bit logical immediates,
// emitting an ORR-immediate followed by an EOR-immediate.
//
// This encoding allows all remaining repeated byte patterns, and many repeated
// 16-bit values, to be encoded without needing four instructions. It can also
// represent some irregular bitmasks (although those would mostly only need
// three instructions otherwise).
//
// \param Imm   The 64-bit constant to materialize.
// \param Insn  Receives the two-instruction sequence on success; it is left
//              untouched on failure.
// \returns true if an ORR + EOR pair was appended to Insn, false otherwise.
static bool tryEorOfLogicalImmediates(uint64_t Imm,
                                      SmallVectorImpl<ImmInsnModel> &Insn) {
  // Determine the larger repetition size of the two possible logical
  // immediates, by finding the repetition size of Imm: keep halving while the
  // two lowest halves of the current chunk are identical. The search stops at
  // a size of 2.
  unsigned BigSize = 64;
  while (BigSize > 2) {
    unsigned HalfSize = BigSize / 2;
    uint64_t HalfMask = (1ULL << HalfSize) - 1;

    if ((Imm & HalfMask) != ((Imm >> HalfSize) & HalfMask))
      break;
    BigSize = HalfSize;
  }

  // Mask selecting the low BigSize bits (BigSize >= 2, so the shift is < 64).
  uint64_t BigMask = ~uint64_t{0} >> (64 - BigSize);

  // Mark the first bit of each run of ones, treating Imm as circular: a bit is
  // marked when it is set and the bit below it (bit 63 for bit 0) is clear.
  uint64_t RunStarts = Imm & ~rotl<uint64_t>(Imm, 1);

  // Find the smaller repetition size of the two possible logical immediates by
  // counting the number of runs of one-bits within the BigSize-bit value. Both
  // sizes may be the same. The EOR may add one or subtract one from the
  // power-of-two count that can be represented by a logical immediate, or it
  // may be left unchanged.
  int RunsPerBigChunk = popcount(RunStarts & BigMask);

  // Maps the run count to log2(BigSize / SmallSize), or -1 if no split can
  // work. The index never reaches 32: that many runs in 64 bits would require
  // strictly alternating bits, for which BigSize is 2 and the count is 1.
  static constexpr int8_t BigToSmallSizeTable[32] = {
      -1, -1, 0,  1,  2,  2,  -1, 3,  3,  3,  -1, -1, -1, -1, -1, 4,
      4,  4,  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5,
  };

  int BigToSmallShift = BigToSmallSizeTable[RunsPerBigChunk];

  // Early-exit if the big chunk couldn't be a power-of-two number of runs
  // EORed with another single run.
  if (BigToSmallShift == -1)
    return false;

  unsigned SmallSize = BigSize >> BigToSmallShift;

  // 64-bit values with a bit set every (1 << index) bits.
  static constexpr uint64_t RepeatedOnesTable[] = {
      0xffffffffffffffff, 0x5555555555555555, 0x1111111111111111,
      0x0101010101010101, 0x0001000100010001, 0x0000000100000001,
      0x0000000000000001,
  };

  // This RepeatedOnesTable lookup is a faster implementation of the division
  // 0xffffffffffffffff / ((1 << SmallSize) - 1), and can be thought of as
  // dividing the 64-bit value into fields of width SmallSize, and placing a
  // one in the least significant bit of each field.
  uint64_t SmallOnes = RepeatedOnesTable[countr_zero(SmallSize)];

  // Now we try to find the number of ones in each of the smaller repetitions,
  // by looking at runs of ones in Imm. This can take three attempts, as the
  // EOR may have changed the length of the first two runs we find.

  // Rotate a run of ones down to bit 0 so we can count the number of trailing
  // set bits. RunStarts is non-zero here because at least two runs exist.
  int Rotation = countr_zero(RunStarts);
  uint64_t RotatedImm = rotr<uint64_t>(Imm, Rotation);
  for (int Attempt = 0; Attempt < 3; ++Attempt) {
    unsigned RunLength = countr_one(RotatedImm);

    // Construct candidate values BigImm and SmallImm, such that if these two
    // values are encodable, we have a solution. (SmallImm is constructed to be
    // encodable, but this isn't guaranteed when RunLength >= SmallSize)
    uint64_t SmallImm =
        rotl<uint64_t>((SmallOnes << RunLength) - SmallOnes, Rotation);
    uint64_t BigImm = Imm ^ SmallImm;

    uint64_t BigEncoding = 0;
    uint64_t SmallEncoding = 0;
    if (AArch64_AM::processLogicalImmediate(BigImm, 64, BigEncoding) &&
        AArch64_AM::processLogicalImmediate(SmallImm, 64, SmallEncoding)) {
      Insn.push_back({AArch64::ORRXri, 0, SmallEncoding});
      Insn.push_back({AArch64::EORXri, 1, BigEncoding});
      return true;
    }

    // Rotate to the next run of ones: with the current run start sitting at
    // bit 0, mask it off and measure the distance to the following one.
    Rotation += countr_zero(rotr<uint64_t>(RunStarts, Rotation) & ~uint64_t{1});
    RotatedImm = rotr<uint64_t>(Imm, Rotation);
  }

  return false;
}

/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to a
/// MOVZ or MOVN of width BitSize followed by up to 3 MOVK instructions.
static inline void expandMOVImmSimple(uint64_t Imm, unsigned BitSize,
Expand Down Expand Up @@ -503,6 +602,10 @@ void AArch64_IMM::expandMOVImm(uint64_t Imm, unsigned BitSize,
if (tryAndOfLogicalImmediates(Imm, Insn))
return;

// Attempt to use a sequence of ORR-immediate followed by EOR-immediate.
if (tryEorOfLogicalImmediates(UImm, Insn))
return;

// FIXME: Add more two-instruction sequences.

// Three instruction sequences.
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
}
break;
case AArch64::ANDXri:
case AArch64::EORXri:
if (I->Op1 == 0) {
MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode))
.add(MI.getOperand(0))
Expand Down
117 changes: 117 additions & 0 deletions llvm/test/CodeGen/AArch64/arm64-movi.ll
Original file line number Diff line number Diff line change
Expand Up @@ -432,3 +432,120 @@ define i64 @orr_64_orr_8() nounwind {
; CHECK-NEXT: ret
ret i64 -5764607889538110806
}

; Constant expected to be built in two instructions: a mov of
; 0x5555555555555555 (pattern repeating every 2 bits) followed by an eor with
; 0x3000300030003000 (pattern repeating every 16 bits).
define i64 @orr_2_eor_16() nounwind {
; CHECK-LABEL: orr_2_eor_16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #6148914691236517205
; CHECK-NEXT: eor x0, x0, #0x3000300030003000
; CHECK-NEXT: ret
ret i64 7301853788297848149
}

; Constant expected to be built in two instructions: a mov of
; 0x5555555555555555 (pattern repeating every 2 bits) followed by an eor with
; 0x001fffc0001fffc0 (pattern repeating every 32 bits).
define i64 @orr_2_eor_32() nounwind {
; CHECK-LABEL: orr_2_eor_32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #6148914691236517205
; CHECK-NEXT: eor x0, x0, #0x1fffc0001fffc0
; CHECK-NEXT: ret
ret i64 6145912199858268821
}

; Constant expected to be built in two instructions: a mov of
; 0x5555555555555555 (pattern repeating every 2 bits) followed by an eor with
; 0x0001fffffffffc00 (a single run of ones in the full 64-bit value).
define i64 @orr_2_eor_64() nounwind {
; CHECK-LABEL: orr_2_eor_64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #6148914691236517205
; CHECK-NEXT: eor x0, x0, #0x1fffffffffc00
; CHECK-NEXT: ret
ret i64 6148727041252043093
}

; Constant expected to be built in two instructions: a mov of
; 0x2222222222222222 (pattern repeating every 4 bits) followed by an eor with
; 0x8f8f8f8f8f8f8f8f (pattern repeating every 8 bits).
define i64 @orr_4_eor_8() nounwind {
; CHECK-LABEL: orr_4_eor_8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #2459565876494606882
; CHECK-NEXT: eor x0, x0, #0x8f8f8f8f8f8f8f8f
; CHECK-NEXT: ret
ret i64 12514849900987264429
}

; Constant expected to be built in two instructions: a mov of
; 0x4444444444444444 (pattern repeating every 4 bits) followed by an eor with
; 0xf00ff00ff00ff00f (pattern repeating every 16 bits).
define i64 @orr_4_eor_16() nounwind {
; CHECK-LABEL: orr_4_eor_16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #4919131752989213764
; CHECK-NEXT: eor x0, x0, #0xf00ff00ff00ff00f
; CHECK-NEXT: ret
ret i64 12991675787320734795
}

; Constant expected to be built in two instructions: a mov of
; 0x4444444444444444 (pattern repeating every 4 bits) followed by an eor with
; 0x1ff800001ff80000 (pattern repeating every 32 bits).
define i64 @orr_4_eor_32() nounwind {
; CHECK-LABEL: orr_4_eor_32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #4919131752989213764
; CHECK-NEXT: eor x0, x0, #0x1ff800001ff80000
; CHECK-NEXT: ret
ret i64 6610233413460575300
}

; Constant expected to be built in two instructions: a mov of
; 0x1111111111111111 (pattern repeating every 4 bits) followed by an eor with
; 0x00000fff80000000 (a single run of ones in the full 64-bit value).
define i64 @orr_4_eor_64() nounwind {
; CHECK-LABEL: orr_4_eor_64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #1229782938247303441
; CHECK-NEXT: eor x0, x0, #0xfff80000000
; CHECK-NEXT: ret
ret i64 1229798183233720593
}

; Constant expected to be built in two instructions: a mov of
; 0x3030303030303030 (pattern repeating every 8 bits) followed by an eor with
; 0x1f801f801f801f80 (pattern repeating every 16 bits).
define i64 @orr_8_eor_16() nounwind {
; CHECK-LABEL: orr_8_eor_16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #3472328296227680304
; CHECK-NEXT: eor x0, x0, #0x1f801f801f801f80
; CHECK-NEXT: ret
ret i64 3436298949444513712
}

; Constant expected to be built in two instructions: a mov of
; 0x1010101010101010 (pattern repeating every 8 bits) followed by an eor with
; 0xffff8001ffff8001 (pattern repeating every 32 bits).
define i64 @orr_8_eor_32() nounwind {
; CHECK-LABEL: orr_8_eor_32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #1157442765409226768
; CHECK-NEXT: eor x0, x0, #0xffff8001ffff8001
; CHECK-NEXT: ret
ret i64 17289195901212921873
}

; Constant expected to be built in two instructions: a mov of
; 0x3030303030303030 (pattern repeating every 8 bits) followed by an eor with
; 0x003ffffffff00000 (a single run of ones in the full 64-bit value).
define i64 @orr_8_eor_64() nounwind {
; CHECK-LABEL: orr_8_eor_64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #3472328296227680304
; CHECK-NEXT: eor x0, x0, #0x3ffffffff00000
; CHECK-NEXT: ret
ret i64 3463215129921859632
}

; Constant expected to be built in two instructions: a mov of
; 0x0fe00fe00fe00fe0 (pattern repeating every 16 bits) followed by an eor with
; 0xffff0001ffff0001 (pattern repeating every 32 bits).
define i64 @orr_16_eor_32() nounwind {
; CHECK-LABEL: orr_16_eor_32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #1143931760365539296
; CHECK-NEXT: eor x0, x0, #0xffff0001ffff0001
; CHECK-NEXT: ret
ret i64 17302565756451360737
}

; Constant expected to be built in two instructions: a mov of
; 0x7fe07fe07fe07fe0 (pattern repeating every 16 bits) followed by an eor with
; 0x00000000000fe000 (a single run of ones in the full 64-bit value).
define i64 @orr_16_eor_64() nounwind {
; CHECK-LABEL: orr_16_eor_64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #9214505439794855904
; CHECK-NEXT: eor x0, x0, #0xfe000
; CHECK-NEXT: ret
ret i64 9214505439795847136
}

; Constant expected to be built in two instructions: a mov of
; 0x000000f0000000f0 (pattern repeating every 32 bits) followed by an eor with
; 0xffff8000003fffff (a single circular run of ones in the full 64-bit value).
define i64 @orr_32_eor_64() nounwind {
; CHECK-LABEL: orr_32_eor_64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x0, #1030792151280
; CHECK-NEXT: eor x0, x0, #0xffff8000003fffff
; CHECK-NEXT: ret
ret i64 18446604367017541391
}