From 1ff2eb1d866a7b50e1d98af2737555e9d8b7e223 Mon Sep 17 00:00:00 2001 From: Dougall Johnson Date: Thu, 5 Oct 2023 18:53:40 +1100 Subject: [PATCH] [Aarch64] Materialize immediates with 64-bit ORR + EOR if shorter A number of useful constants can be encoded with a 64-bit ORR followed by a 64-bit EOR, including all remaining repeated byte patterns, some useful repeated 16-bit patterns, and some irregular masks. This patch prioritizes that encoding over three or four instruction encodings. Encoding with MOV + MOVK or ORR + MOVK is still preferred for fast literal generation and readability respectively. The method devises three candidate values, and checks if both Candidate and (Imm ^ Candidate) are valid logical immediates. If so, Imm is materialized with: ``` ORR Xd, XZR, #(Imm ^ Candidate) EOR Xd, Xd, #(Candidate) ``` The method has been exhaustively tested to ensure it can solve all possible values (excluding 0, ~0, and plain logical immediates, which are handled earlier). --- llvm/lib/Target/AArch64/AArch64ExpandImm.cpp | 103 +++++++++++++++ .../AArch64/AArch64ExpandPseudoInsts.cpp | 1 + llvm/test/CodeGen/AArch64/arm64-movi.ll | 117 ++++++++++++++++++ 3 files changed, 221 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp index 731972a039ba44..a7d72b59b1d5a6 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp @@ -362,6 +362,105 @@ static bool tryAndOfLogicalImmediates(uint64_t UImm, return false; } +// Check whether the constant can be represented by exclusive-or of two 64-bit +// logical immediates. If so, materialize it with an ORR instruction followed +// by an EOR instruction. +// +// This encoding allows all remaining repeated byte patterns, and many repeated +// 16-bit values, to be encoded without needing four instructions. 
It can also
+// represent some irregular bitmasks (although those would mostly only need
+// three instructions otherwise).
+static bool tryEorOfLogicalImmediates(uint64_t Imm,
+                                      SmallVectorImpl<ImmInsnModel> &Insn) {
+  // Determine the larger repetition size of the two possible logical
+  // immediates, by finding the repetition size of Imm.
+  unsigned BigSize = 64;
+
+  do {
+    BigSize /= 2;
+    uint64_t Mask = (1ULL << BigSize) - 1;
+
+    if ((Imm & Mask) != ((Imm >> BigSize) & Mask)) {
+      BigSize *= 2;
+      break;
+    }
+  } while (BigSize > 2);
+
+  uint64_t BigMask = ((uint64_t)-1LL) >> (64 - BigSize);
+
+  // Find the last bit of each run of ones, circularly. For runs which wrap
+  // around from bit 0 to bit 63, this is the bit before the most-significant
+  // zero, otherwise it is the least-significant bit in the run of ones.
+  uint64_t RunStarts = Imm & ~rotl(Imm, 1);
+
+  // Find the smaller repetition size of the two possible logical immediates by
+  // counting the number of runs of one-bits within the BigSize-bit value. Both
+  // sizes may be the same. The EOR may add one or subtract one from the
+  // power-of-two count that can be represented by a logical immediate, or it
+  // may be left unchanged.
+  int RunsPerBigChunk = popcount(RunStarts & BigMask);
+
+  static const int8_t BigToSmallSizeTable[32] = {
+      -1, -1, 0,  1,  2,  2,  -1, 3,  3,  3,  -1, -1, -1, -1, -1, 4,
+      4,  4,  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5,
+  };
+
+  int BigToSmallShift = BigToSmallSizeTable[RunsPerBigChunk];
+
+  // Early-exit if the big chunk couldn't be a power-of-two number of runs
+  // EORed with another single run.
+  if (BigToSmallShift == -1)
+    return false;
+
+  unsigned SmallSize = BigSize >> BigToSmallShift;
+
+  // 64-bit values with a bit set every (1 << index) bits.
+ static const uint64_t RepeatedOnesTable[] = { + 0xffffffffffffffff, 0x5555555555555555, 0x1111111111111111, + 0x0101010101010101, 0x0001000100010001, 0x0000000100000001, + 0x0000000000000001, + }; + + // This RepeatedOnesTable lookup is a faster implementation of the division + // 0xffffffffffffffff / ((1 << SmallSize) - 1), and can be thought of as + // dividing the 64-bit value into fields of width SmallSize, and placing a + // one in the least significant bit of each field. + uint64_t SmallOnes = RepeatedOnesTable[countr_zero(SmallSize)]; + + // Now we try to find the number of ones in each of the smaller repetitions, + // by looking at runs of ones in Imm. This can take three attempts, as the + // EOR may have changed the length of the first two runs we find. + + // Rotate a run of ones so we can count the number of trailing set bits. + int Rotation = countr_zero(RunStarts); + uint64_t RotatedImm = rotr(Imm, Rotation); + for (int Attempt = 0; Attempt < 3; ++Attempt) { + unsigned RunLength = countr_one(RotatedImm); + + // Construct candidate values BigImm and SmallImm, such that if these two + // values are encodable, we have a solution. 
(SmallImm is constructed to be + // encodable, but this isn't guaranteed when RunLength >= SmallSize) + uint64_t SmallImm = + rotl((SmallOnes << RunLength) - SmallOnes, Rotation); + uint64_t BigImm = Imm ^ SmallImm; + + uint64_t BigEncoding = 0; + uint64_t SmallEncoding = 0; + if (AArch64_AM::processLogicalImmediate(BigImm, 64, BigEncoding) && + AArch64_AM::processLogicalImmediate(SmallImm, 64, SmallEncoding)) { + Insn.push_back({AArch64::ORRXri, 0, SmallEncoding}); + Insn.push_back({AArch64::EORXri, 1, BigEncoding}); + return true; + } + + // Rotate to the next run of ones + Rotation += countr_zero(rotr(RunStarts, Rotation) & ~1); + RotatedImm = rotr(Imm, Rotation); + } + + return false; +} + /// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to a /// MOVZ or MOVN of width BitSize followed by up to 3 MOVK instructions. static inline void expandMOVImmSimple(uint64_t Imm, unsigned BitSize, @@ -503,6 +602,10 @@ void AArch64_IMM::expandMOVImm(uint64_t Imm, unsigned BitSize, if (tryAndOfLogicalImmediates(Imm, Insn)) return; + // Attempt to use a sequence of ORR-immediate followed by EOR-immediate. + if (tryEorOfLogicalImmediates(UImm, Insn)) + return; + // FIXME: Add more two-instruction sequences. // Three instruction sequences. 
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index cc61373d51d718..38b5eeaf4057a7 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -171,6 +171,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, } break; case AArch64::ANDXri: + case AArch64::EORXri: if (I->Op1 == 0) { MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode)) .add(MI.getOperand(0)) diff --git a/llvm/test/CodeGen/AArch64/arm64-movi.ll b/llvm/test/CodeGen/AArch64/arm64-movi.ll index 2ec58dbee02325..8ec98b74429718 100644 --- a/llvm/test/CodeGen/AArch64/arm64-movi.ll +++ b/llvm/test/CodeGen/AArch64/arm64-movi.ll @@ -432,3 +432,120 @@ define i64 @orr_64_orr_8() nounwind { ; CHECK-NEXT: ret ret i64 -5764607889538110806 } + +define i64 @orr_2_eor_16() nounwind { +; CHECK-LABEL: orr_2_eor_16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #6148914691236517205 +; CHECK-NEXT: eor x0, x0, #0x3000300030003000 +; CHECK-NEXT: ret + ret i64 7301853788297848149 +} + +define i64 @orr_2_eor_32() nounwind { +; CHECK-LABEL: orr_2_eor_32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #6148914691236517205 +; CHECK-NEXT: eor x0, x0, #0x1fffc0001fffc0 +; CHECK-NEXT: ret + ret i64 6145912199858268821 +} + +define i64 @orr_2_eor_64() nounwind { +; CHECK-LABEL: orr_2_eor_64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #6148914691236517205 +; CHECK-NEXT: eor x0, x0, #0x1fffffffffc00 +; CHECK-NEXT: ret + ret i64 6148727041252043093 +} + +define i64 @orr_4_eor_8() nounwind { +; CHECK-LABEL: orr_4_eor_8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #2459565876494606882 +; CHECK-NEXT: eor x0, x0, #0x8f8f8f8f8f8f8f8f +; CHECK-NEXT: ret + ret i64 12514849900987264429 +} + +define i64 @orr_4_eor_16() nounwind { +; CHECK-LABEL: orr_4_eor_16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #4919131752989213764 +; CHECK-NEXT: eor x0, x0, #0xf00ff00ff00ff00f +; 
CHECK-NEXT: ret + ret i64 12991675787320734795 +} + +define i64 @orr_4_eor_32() nounwind { +; CHECK-LABEL: orr_4_eor_32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #4919131752989213764 +; CHECK-NEXT: eor x0, x0, #0x1ff800001ff80000 +; CHECK-NEXT: ret + ret i64 6610233413460575300 +} + +define i64 @orr_4_eor_64() nounwind { +; CHECK-LABEL: orr_4_eor_64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #1229782938247303441 +; CHECK-NEXT: eor x0, x0, #0xfff80000000 +; CHECK-NEXT: ret + ret i64 1229798183233720593 +} + +define i64 @orr_8_eor_16() nounwind { +; CHECK-LABEL: orr_8_eor_16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #3472328296227680304 +; CHECK-NEXT: eor x0, x0, #0x1f801f801f801f80 +; CHECK-NEXT: ret + ret i64 3436298949444513712 +} + +define i64 @orr_8_eor_32() nounwind { +; CHECK-LABEL: orr_8_eor_32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #1157442765409226768 +; CHECK-NEXT: eor x0, x0, #0xffff8001ffff8001 +; CHECK-NEXT: ret + ret i64 17289195901212921873 +} + +define i64 @orr_8_eor_64() nounwind { +; CHECK-LABEL: orr_8_eor_64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #3472328296227680304 +; CHECK-NEXT: eor x0, x0, #0x3ffffffff00000 +; CHECK-NEXT: ret + ret i64 3463215129921859632 +} + +define i64 @orr_16_eor_32() nounwind { +; CHECK-LABEL: orr_16_eor_32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #1143931760365539296 +; CHECK-NEXT: eor x0, x0, #0xffff0001ffff0001 +; CHECK-NEXT: ret + ret i64 17302565756451360737 +} + +define i64 @orr_16_eor_64() nounwind { +; CHECK-LABEL: orr_16_eor_64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #9214505439794855904 +; CHECK-NEXT: eor x0, x0, #0xfe000 +; CHECK-NEXT: ret + ret i64 9214505439795847136 +} + +define i64 @orr_32_eor_64() nounwind { +; CHECK-LABEL: orr_32_eor_64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, #1030792151280 +; CHECK-NEXT: eor x0, x0, #0xffff8000003fffff +; CHECK-NEXT: ret + ret i64 18446604367017541391 +}