cmd/compile: don't use BTS when OR works, add direct memory BTS operations

Stop using BTSconst and friends when ORLconst can be used instead.
OR can be issued by more function units than BTS can, so it could
lead to better IPC. OR might take a few more bytes to encode, but
not a lot more.

Still use BTSconst for cases where the constant otherwise wouldn't
fit and would require a separate movabs instruction to materialize
the constant. This happens when setting bits 31-63 of 64-bit targets.

Add BTS-to-memory operations so we don't need to load/bts/store.
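
A rough sketch of how these cases lower on amd64 after this change (illustrative Go, not part of the original commit message; exact registers and encodings depend on the surrounding code):

```go
package bits

// 1<<20 fits in the signed 32-bit immediate of ORQ, so the plain OR
// is used: it can issue on more function units than BTS.
func setLow(x uint64) uint64 { return x | 1<<20 } // ORQ $0x100000, ...

// 1<<40 does not fit in a signed 32-bit immediate; without BTS the
// compiler would need a separate movabs-style constant load.
func setHigh(x uint64) uint64 { return x | 1<<40 } // BTSQ $40, ...

// With the new memory operations this is a single read-modify-write
// instruction instead of load/bts/store.
func setMem(p *uint64) { *p |= 1 << 40 } // BTSQ $40, (...)
```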

Fixes #61694

Change-Id: I00379608df8fb0167cb01466e97d11dec7c1596c
Reviewed-on: https://go-review.googlesource.com/c/go/+/515755
Reviewed-by: Keith Randall <khr@google.com>
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
randall77 committed Aug 4, 2023
1 parent 51cb12e commit 611706b
Showing 8 changed files with 211 additions and 671 deletions.
9 changes: 5 additions & 4 deletions src/cmd/compile/internal/amd64/ssa.go
@@ -714,9 +714,9 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
p.To.Offset = v.AuxInt
case ssa.OpAMD64BTLconst, ssa.OpAMD64BTQconst,
ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst,
- ssa.OpAMD64BTSLconst, ssa.OpAMD64BTSQconst,
- ssa.OpAMD64BTCLconst, ssa.OpAMD64BTCQconst,
- ssa.OpAMD64BTRLconst, ssa.OpAMD64BTRQconst:
+ ssa.OpAMD64BTSQconst,
+ ssa.OpAMD64BTCQconst,
+ ssa.OpAMD64BTRQconst:
op := v.Op
if op == ssa.OpAMD64BTQconst && v.AuxInt < 32 {
// Emit 32-bit version because it's shorter
@@ -851,7 +851,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
}
fallthrough
case ssa.OpAMD64ANDQconstmodify, ssa.OpAMD64ANDLconstmodify, ssa.OpAMD64ORQconstmodify, ssa.OpAMD64ORLconstmodify,
- ssa.OpAMD64XORQconstmodify, ssa.OpAMD64XORLconstmodify:
+ ssa.OpAMD64XORQconstmodify, ssa.OpAMD64XORLconstmodify,
+ ssa.OpAMD64BTSQconstmodify, ssa.OpAMD64BTRQconstmodify, ssa.OpAMD64BTCQconstmodify:
sc := v.AuxValAndOff()
off := sc.Off64()
val := sc.Val64()
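The constmodify cases above unpack both a value and an offset from a single AuxInt. A self-contained sketch of that packing, mirroring (as an assumption) the ValAndOff encoding in cmd/compile/internal/ssa; this is illustrative code, not code from this commit:

```go
package main

import "fmt"

// valAndOff mimics ssa.ValAndOff: a 32-bit value in the high half and
// a 32-bit offset in the low half of one int64 AuxInt.
type valAndOff int64

func makeValAndOff(val, off int32) valAndOff {
	return valAndOff(int64(val)<<32 | int64(uint32(off)))
}

func (x valAndOff) val() int64 { return int64(x) >> 32 }    // bit index for BTS/BTR/BTC
func (x valAndOff) off() int64 { return int64(int32(x)) }   // memory displacement

func main() {
	sc := makeValAndOff(40, 8)      // bit 40 at offset 8
	fmt.Println(sc.val(), sc.off()) // prints: 40 8
}
```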
71 changes: 16 additions & 55 deletions src/cmd/compile/internal/ssa/_gen/AMD64.rules
@@ -82,8 +82,8 @@
(Ctz32 x) && buildcfg.GOAMD64 >= 3 => (TZCNTL x)
(Ctz64 <t> x) && buildcfg.GOAMD64 < 3 => (CMOVQEQ (Select0 <t> (BSFQ x)) (MOVQconst <t> [64]) (Select1 <types.TypeFlags> (BSFQ x)))
(Ctz32 x) && buildcfg.GOAMD64 < 3 => (Select0 (BSFQ (BTSQconst <typ.UInt64> [32] x)))
- (Ctz16 x) => (BSFL (BTSLconst <typ.UInt32> [16] x))
- (Ctz8 x) => (BSFL (BTSLconst <typ.UInt32> [ 8] x))
+ (Ctz16 x) => (BSFL (ORLconst <typ.UInt32> [1<<16] x))
+ (Ctz8 x) => (BSFL (ORLconst <typ.UInt32> [1<<8 ] x))

(Ctz64NonZero x) && buildcfg.GOAMD64 >= 3 => (TZCNTQ x)
(Ctz32NonZero x) && buildcfg.GOAMD64 >= 3 => (TZCNTL x)
@@ -659,29 +659,16 @@
// Recognize bit setting (a |= 1<<b) and toggling (a ^= 1<<b)
(OR(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y) x) => (BTS(Q|L) x y)
(XOR(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y) x) => (BTC(Q|L) x y)

- // Convert ORconst into BTS, if the code gets smaller, with boundary being
- // (ORL $40,AX is 3 bytes, ORL $80,AX is 6 bytes).
- ((ORQ|XORQ)const [c] x) && isUint64PowerOfTwo(int64(c)) && uint64(c) >= 128
-   => (BT(S|C)Qconst [int8(log32(c))] x)
- ((ORL|XORL)const [c] x) && isUint32PowerOfTwo(int64(c)) && uint64(c) >= 128
-   => (BT(S|C)Lconst [int8(log32(c))] x)
- ((ORQ|XORQ) (MOVQconst [c]) x) && isUint64PowerOfTwo(c) && uint64(c) >= 128
-   => (BT(S|C)Qconst [int8(log64(c))] x)
- ((ORL|XORL) (MOVLconst [c]) x) && isUint32PowerOfTwo(int64(c)) && uint64(c) >= 128
-   => (BT(S|C)Lconst [int8(log32(c))] x)
+ // Note: only convert OR/XOR to BTS/BTC if the constant wouldn't fit in
+ // the constant field of the OR/XOR instruction. See issue 61694.
+ ((OR|XOR)Q (MOVQconst [c]) x) && isUint64PowerOfTwo(c) && uint64(c) >= 1<<31 => (BT(S|C)Qconst [int8(log64(c))] x)

// Recognize bit clearing: a &^= 1<<b
(AND(Q|L) (NOT(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y)) x) => (BTR(Q|L) x y)
(ANDN(Q|L) x (SHL(Q|L) (MOV(Q|L)const [1]) y)) => (BTR(Q|L) x y)
- (ANDQconst [c] x) && isUint64PowerOfTwo(int64(^c)) && uint64(^c) >= 128
-   => (BTRQconst [int8(log32(^c))] x)
- (ANDLconst [c] x) && isUint32PowerOfTwo(int64(^c)) && uint64(^c) >= 128
-   => (BTRLconst [int8(log32(^c))] x)
- (ANDQ (MOVQconst [c]) x) && isUint64PowerOfTwo(^c) && uint64(^c) >= 128
-   => (BTRQconst [int8(log64(^c))] x)
- (ANDL (MOVLconst [c]) x) && isUint32PowerOfTwo(int64(^c)) && uint64(^c) >= 128
-   => (BTRLconst [int8(log32(^c))] x)
+ // Note: only convert AND to BTR if the constant wouldn't fit in
+ // the constant field of the AND instruction. See issue 61694.
+ (ANDQ (MOVQconst [c]) x) && isUint64PowerOfTwo(^c) && uint64(^c) >= 1<<31 => (BTRQconst [int8(log64(^c))] x)

// Special-case bit patterns on first/last bit.
// generic.rules changes ANDs of high-part/low-part masks into a couple of shifts,
@@ -695,9 +682,9 @@

// Special case resetting first/last bit
(SHL(L|Q)const [1] (SHR(L|Q)const [1] x))
-   => (BTR(L|Q)const [0] x)
+   => (AND(L|Q)const [-2] x)
(SHRLconst [1] (SHLLconst [1] x))
-   => (BTRLconst [31] x)
+   => (ANDLconst [0x7fffffff] x)
(SHRQconst [1] (SHLQconst [1] x))
=> (BTRQconst [63] x)

@@ -731,10 +718,10 @@
=> (SET(B|AE)store [off] {sym} ptr (BTLconst [31] x) mem)

// Fold combinations of bit ops on same bit. An example is math.Copysign(c,-1)
- (BTS(Q|L)const [c] (BTR(Q|L)const [c] x)) => (BTS(Q|L)const [c] x)
- (BTS(Q|L)const [c] (BTC(Q|L)const [c] x)) => (BTS(Q|L)const [c] x)
- (BTR(Q|L)const [c] (BTS(Q|L)const [c] x)) => (BTR(Q|L)const [c] x)
- (BTR(Q|L)const [c] (BTC(Q|L)const [c] x)) => (BTR(Q|L)const [c] x)
+ (BTSQconst [c] (BTRQconst [c] x)) => (BTSQconst [c] x)
+ (BTSQconst [c] (BTCQconst [c] x)) => (BTSQconst [c] x)
+ (BTRQconst [c] (BTSQconst [c] x)) => (BTRQconst [c] x)
+ (BTRQconst [c] (BTCQconst [c] x)) => (BTRQconst [c] x)

// Fold boolean negation into SETcc.
(XORLconst [1] (SETNE x)) => (SETEQ x)
@@ -778,31 +765,6 @@
(XOR(L|Q)const [c] (XOR(L|Q)const [d] x)) => (XOR(L|Q)const [c ^ d] x)
(OR(L|Q)const [c] (OR(L|Q)const [d] x)) => (OR(L|Q)const [c | d] x)

- (BTRLconst [c] (ANDLconst [d] x)) => (ANDLconst [d &^ (1<<uint32(c))] x)
- (ANDLconst [c] (BTRLconst [d] x)) => (ANDLconst [c &^ (1<<uint32(d))] x)
- (BTRLconst [c] (BTRLconst [d] x)) => (ANDLconst [^(1<<uint32(c) | 1<<uint32(d))] x)

- (BTCLconst [c] (XORLconst [d] x)) => (XORLconst [d ^ 1<<uint32(c)] x)
- (XORLconst [c] (BTCLconst [d] x)) => (XORLconst [c ^ 1<<uint32(d)] x)
- (BTCLconst [c] (BTCLconst [d] x)) => (XORLconst [1<<uint32(c) | 1<<uint32(d)] x)

- (BTSLconst [c] (ORLconst [d] x)) => (ORLconst [d | 1<<uint32(c)] x)
- (ORLconst [c] (BTSLconst [d] x)) => (ORLconst [c | 1<<uint32(d)] x)
- (BTSLconst [c] (BTSLconst [d] x)) => (ORLconst [1<<uint32(c) | 1<<uint32(d)] x)

- (BTRQconst [c] (ANDQconst [d] x)) && is32Bit(int64(d) &^ (1<<uint32(c))) => (ANDQconst [d &^ (1<<uint32(c))] x)
- (ANDQconst [c] (BTRQconst [d] x)) && is32Bit(int64(c) &^ (1<<uint32(d))) => (ANDQconst [c &^ (1<<uint32(d))] x)
- (BTRQconst [c] (BTRQconst [d] x)) && is32Bit(^(1<<uint32(c) | 1<<uint32(d))) => (ANDQconst [^(1<<uint32(c) | 1<<uint32(d))] x)

- (BTCQconst [c] (XORQconst [d] x)) && is32Bit(int64(d) ^ 1<<uint32(c)) => (XORQconst [d ^ 1<<uint32(c)] x)
- (XORQconst [c] (BTCQconst [d] x)) && is32Bit(int64(c) ^ 1<<uint32(d)) => (XORQconst [c ^ 1<<uint32(d)] x)
- (BTCQconst [c] (BTCQconst [d] x)) && is32Bit(1<<uint32(c) ^ 1<<uint32(d)) => (XORQconst [1<<uint32(c) ^ 1<<uint32(d)] x)

- (BTSQconst [c] (ORQconst [d] x)) && is32Bit(int64(d) | 1<<uint32(c)) => (ORQconst [d | 1<<uint32(c)] x)
- (ORQconst [c] (BTSQconst [d] x)) && is32Bit(int64(c) | 1<<uint32(d)) => (ORQconst [c | 1<<uint32(d)] x)
- (BTSQconst [c] (BTSQconst [d] x)) && is32Bit(1<<uint32(c) | 1<<uint32(d)) => (ORQconst [1<<uint32(c) | 1<<uint32(d)] x)


(MULLconst [c] (MULLconst [d] x)) => (MULLconst [c * d] x)
(MULQconst [c] (MULQconst [d] x)) && is32Bit(int64(c)*int64(d)) => (MULQconst [c * d] x)

@@ -1422,11 +1384,8 @@
(NOTQ (MOVQconst [c])) => (MOVQconst [^c])
(NOTL (MOVLconst [c])) => (MOVLconst [^c])
(BTSQconst [c] (MOVQconst [d])) => (MOVQconst [d|(1<<uint32(c))])
- (BTSLconst [c] (MOVLconst [d])) => (MOVLconst [d|(1<<uint32(c))])
(BTRQconst [c] (MOVQconst [d])) => (MOVQconst [d&^(1<<uint32(c))])
- (BTRLconst [c] (MOVLconst [d])) => (MOVLconst [d&^(1<<uint32(c))])
(BTCQconst [c] (MOVQconst [d])) => (MOVQconst [d^(1<<uint32(c))])
- (BTCLconst [c] (MOVLconst [d])) => (MOVLconst [d^(1<<uint32(c))])

// If c or d doesn't fit into 32 bits, then we can't construct ORQconst,
// but we can still constant-fold.
@@ -1513,6 +1472,8 @@
(MOVQstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Qload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Qmodify [off] {sym} ptr x mem)
(MOVQstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)Q l:(MOVQload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
((ADD|SUB|AND|OR|XOR)Qmodify [off] {sym} ptr x mem)
+ (MOVQstore {sym} [off] ptr x:(BT(S|R|C)Qconst [c] l:(MOVQload {sym} [off] ptr mem)) mem) && x.Uses == 1 && l.Uses == 1 && clobber(x, l) =>
+   (BT(S|R|C)Qconstmodify {sym} [makeValAndOff(int32(c),off)] ptr mem)

// Merge ADDQconst and LEAQ into atomic loads.
(MOV(Q|L|B)atomicload [off1] {sym} (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) =>
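The new MOVQstore rule above is what folds a load/BTS/store sequence into a single memory-operand instruction. A hedged sketch of the kind of source it targets (my example, not taken from the CL):

```go
package flags

type state struct{ bits uint64 }

// Before this CL: MOVQ (AX), CX; BTSQ $40, CX; MOVQ CX, (AX).
// After: BTSQ $40, (AX) via BTSQconstmodify. The same fold applies to
// &^= (BTR) and ^= (BTC) for bit indexes 31-63.
func (s *state) setHigh() { s.bits |= 1 << 40 }
```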
27 changes: 21 additions & 6 deletions src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
@@ -399,12 +399,27 @@ func init() {
{name: "BTSQ", argLength: 2, reg: gp21, asm: "BTSQ", resultInArg0: true, clobberFlags: true}, // set bit arg1%64 in arg0
{name: "BTLconst", argLength: 1, reg: gp1flags, asm: "BTL", typ: "Flags", aux: "Int8"}, // test whether bit auxint in arg0 is set, 0 <= auxint < 32
{name: "BTQconst", argLength: 1, reg: gp1flags, asm: "BTQ", typ: "Flags", aux: "Int8"}, // test whether bit auxint in arg0 is set, 0 <= auxint < 64
{name: "BTCLconst", argLength: 1, reg: gp11, asm: "BTCL", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // complement bit auxint in arg0, 0 <= auxint < 32
{name: "BTCQconst", argLength: 1, reg: gp11, asm: "BTCQ", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // complement bit auxint in arg0, 0 <= auxint < 64
{name: "BTRLconst", argLength: 1, reg: gp11, asm: "BTRL", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // reset bit auxint in arg0, 0 <= auxint < 32
{name: "BTRQconst", argLength: 1, reg: gp11, asm: "BTRQ", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // reset bit auxint in arg0, 0 <= auxint < 64
{name: "BTSLconst", argLength: 1, reg: gp11, asm: "BTSL", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // set bit auxint in arg0, 0 <= auxint < 32
{name: "BTSQconst", argLength: 1, reg: gp11, asm: "BTSQ", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // set bit auxint in arg0, 0 <= auxint < 64
{name: "BTCQconst", argLength: 1, reg: gp11, asm: "BTCQ", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // complement bit auxint in arg0, 31 <= auxint < 64
{name: "BTRQconst", argLength: 1, reg: gp11, asm: "BTRQ", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // reset bit auxint in arg0, 31 <= auxint < 64
{name: "BTSQconst", argLength: 1, reg: gp11, asm: "BTSQ", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // set bit auxint in arg0, 31 <= auxint < 64

// BT[SRC]Qconstmodify
//
// S: set bit
// R: reset (clear) bit
// C: complement bit
//
// Apply operation to bit ValAndOff(AuxInt).Val() in the 64 bits at
// memory address arg0+ValAndOff(AuxInt).Off()+aux
// Bit index must be in range (31-63).
// (We use OR/AND/XOR for thinner targets and lower bit indexes.)
// arg1=mem, returns mem
//
// Note that there aren't non-const versions of these instructions.
// Well, there are such instructions, but they are slow and weird so we don't use them.
{name: "BTSQconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTSQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"},
{name: "BTRQconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTRQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"},
{name: "BTCQconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTCQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"},

// TESTx: compare (arg0 & arg1) to 0
{name: "TESTQ", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTQ", typ: "Flags"},
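The narrowed auxint range reflects the division of labor in the comment above: masks below 1<<31 fit in the signed 32-bit immediates of ORQ/ANDQ/XORQ (and the 32-bit L forms cover their whole range), so BT* is reserved for the high bits. A small illustrative pair (assumed lowering based on the new rules, not asserted from the CL's test output):

```go
package bitops

// 1<<7 fits in imm32, so the rewrite to BTC no longer fires; the
// plain XOR issues on more ports.
func toggleLow(x uint64) uint64 { return x ^ 1<<7 } // XORQ $0x80, ...

// 1<<47 would need a 64-bit constant load; BTCQ avoids materializing it.
func toggleHigh(x uint64) uint64 { return x ^ 1<<47 } // BTCQ $47, ...
```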
72 changes: 33 additions & 39 deletions src/cmd/compile/internal/ssa/opGen.go

Some generated files are not rendered by default.
