core/vm: reverse bit order in bytes of code bitmap #24120
Conversation
The benchmarks look ok-ish when checking the whole change: Haswell 4.4 GHz
But if you inspect only the second commit, which only removes the lookup table for
On a Zen3 under external load we got an even bigger boost for the non-PUSH benchmarks, but also a regression for PUSH1.
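For anyone who wants to reproduce these numbers, a minimal benchmark sketch is below. It assumes it lives inside package vm next to the unexported codeBitmap helper; the fixture generator repeatedCode is hypothetical and not part of the repository.

	package vm

	import "testing"

	// repeatedCode builds a hypothetical fixture: n copies of a single
	// opcode, so the benchmark isolates the per-opcode cost of codeBitmap.
	func repeatedCode(op byte, n int) []byte {
		code := make([]byte, n)
		for i := range code {
			code[i] = op
		}
		return code
	}

	func BenchmarkCodeBitmapPUSH1(b *testing.B) {
		code := repeatedCode(0x60 /* PUSH1 */, 1024)
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			codeBitmap(code)
		}
	}

Run with something like go test -bench=CodeBitmap ./core/vm on each commit to compare.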
Assembly diff for the lookup table removal:

diff --git a/rev1.asm b/rev2.asm
index 4f7d7310d..c92d58357 100644
--- a/rev1.asm
+++ b/rev2.asm
@@ -12,7 +12,7 @@ func codeBitmapInternal(code, bits bitvec) bitvec {
MOVQ R9, CX
NOPL
CMPQ CX, BX
- JBE 0x62ad66
+ JBE 0x62ad3c
op := OpCode(code[pc])
MOVZX 0(AX)(CX*1), DX
pc++
@@ -20,49 +20,46 @@ func codeBitmapInternal(code, bits bitvec) bitvec {
if op < PUSH1 || op > PUSH32 {
LEAL -0x60(DX), R10
CMPL $0x1f, R10
- JBE 0x62aac4
- LEAQ github.com/ethereum/go-ethereum/core/vm.lookup(SB), R12
- continue
- JMP 0x62aa9c
+ JA 0x62aa9c
numbits := op - PUSH1 + 1
ADDL $-0x5f, DX
+ NOPW
if numbits >= 8 {
CMPL $0x8, DL
- JAE 0x62ae18
+ JAE 0x62aded
switch numbits {
CMPL $0x3, DL
case 3:
- JA 0x62abd4
+ JA 0x62abc5
case 1:
CMPL $0x1, DL
- JNE 0x62ab14
+ JNE 0x62ab03
bits.set1(pc)
NOPL
- bits[pos/8] |= lookup[pos%8]
+ bits[pos/8] |= 1 << (pos % 8)
MOVQ R9, DX
SHRQ $0x3, R9
+ NOPL
CMPQ R9, SI
- JBE 0x62adfe
+ JBE 0x62add3
MOVZX 0(DI)(R9*1), R10
MOVQ DX, R11
ANDQ $0x7, DX
- LEAQ github.com/ethereum/go-ethereum/core/vm.lookup(SB), R12
- MOVZX 0(R12)(DX*1), DX
- ORL R10, DX
- MOVB DL, 0(DI)(R9*1)
+ BTSL DX, R10
+ MOVB R10, 0(DI)(R9*1)
pc += 1
LEAQ 0x1(R11), R9
JMP 0x62aa9c
case 2:
CMPL $0x2, DL
- JNE 0x62ab72
+ JNE 0x62ab60
bits.setN(set2BitsMask, pc)
NOPL
bits[pos/8] |= l
MOVQ R9, CX
SHRQ $0x3, R9
CMPQ R9, SI
- JBE 0x62adf3
+ JBE 0x62adc8
a := flag << (pos % 8)
MOVQ CX, DX
ANDQ $0x7, CX
@@ -76,27 +73,27 @@ func codeBitmapInternal(code, bits bitvec) bitvec {
SHRW $0x8, R10
if h != 0 {
TESTL R10, R10
- JE 0x62ab62
+ JE 0x62ab51
bits[pos/8+1] = h
LEAQ 0x1(R9), R11
CMPQ R11, SI
- JBE 0x62ade8
+ JBE 0x62adbd
MOVB R10, 0x1(R9)(DI*1)
pc += 2
LEAQ 0x2(DX), R9
- LEAQ github.com/ethereum/go-ethereum/core/vm.lookup(SB), R12
JMP 0x62aa9c
+ NOPW 0(AX)(AX*1)
switch numbits {
CMPL $0x3, DL
case 3:
- JNE 0x62ad5a
+ JNE 0x62aa9c
bits.setN(set3BitsMask, pc)
NOPL
bits[pos/8] |= l
MOVQ R9, CX
SHRQ $0x3, R9
CMPQ R9, SI
- JBE 0x62addd
+ JBE 0x62adb2
a := flag << (pos % 8)
MOVQ CX, DX
ANDQ $0x7, CX
@@ -108,33 +105,34 @@ func codeBitmapInternal(code, bits bitvec) bitvec {
MOVB R11, 0(DI)(R9*1)
h := byte(a >> 8)
SHRW $0x8, R10
+ NOPL 0(AX)(AX*1)
if h != 0 {
TESTL R10, R10
- JE 0x62abc4
+ JE 0x62abb7
bits[pos/8+1] = h
LEAQ 0x1(R9), R11
CMPQ R11, SI
- JBE 0x62add2
+ JBE 0x62ada7
MOVB R10, 0x1(R9)(DI*1)
pc += 3
LEAQ 0x3(DX), R9
- LEAQ github.com/ethereum/go-ethereum/core/vm.lookup(SB), R12
+ NOPL 0(AX)(AX*1)
JMP 0x62aa9c
switch numbits {
CMPL $0x5, DL
case 5:
- JA 0x62aca0
- NOPL 0(AX)
+ JA 0x62ac85
case 4:
CMPL $0x4, DL
- JNE 0x62ac3e
+ JNE 0x62ac2a
bits.setN(set4BitsMask, pc)
NOPL
bits[pos/8] |= l
MOVQ R9, CX
SHRQ $0x3, R9
+ NOPL 0(AX)(AX*1)
CMPQ R9, SI
- JBE 0x62adc7
+ JBE 0x62ad9c
a := flag << (pos % 8)
MOVQ CX, DX
ANDQ $0x7, CX
@@ -148,15 +146,14 @@ func codeBitmapInternal(code, bits bitvec) bitvec {
SHRW $0x8, R10
if h != 0 {
TESTL R10, R10
- JE 0x62ac2e
+ JE 0x62ac21
bits[pos/8+1] = h
LEAQ 0x1(R9), R11
CMPQ R11, SI
- JBE 0x62adbc
+ JBE 0x62ad91
MOVB R10, 0x1(R9)(DI*1)
pc += 4
LEAQ 0x4(DX), R9
- LEAQ github.com/ethereum/go-ethereum/core/vm.lookup(SB), R12
JMP 0x62aa9c
bits.setN(set5BitsMask, pc)
NOPL
@@ -164,7 +161,7 @@ func codeBitmapInternal(code, bits bitvec) bitvec {
MOVQ R9, CX
SHRQ $0x3, R9
CMPQ R9, SI
- JBE 0x62adb1
+ JBE 0x62ad86
a := flag << (pos % 8)
MOVQ CX, DX
ANDQ $0x7, CX
@@ -176,30 +173,29 @@ func codeBitmapInternal(code, bits bitvec) bitvec {
MOVB R11, 0(DI)(R9*1)
h := byte(a >> 8)
SHRW $0x8, R10
+ NOPL 0(AX)
if h != 0 {
TESTL R10, R10
- JE 0x62ac8e
+ JE 0x62ac77
bits[pos/8+1] = h
LEAQ 0x1(R9), R11
- NOPL 0(AX)
CMPQ R11, SI
- JBE 0x62ada6
+ JBE 0x62ad7b
MOVB R10, 0x1(R9)(DI*1)
pc += 5
LEAQ 0x5(DX), R9
- LEAQ github.com/ethereum/go-ethereum/core/vm.lookup(SB), R12
+ NOPL 0(AX)(AX*1)
JMP 0x62aa9c
- NOPW
case 6:
CMPL $0x6, DL
- JNE 0x62ad00
+ JNE 0x62ace5
bits.setN(set6BitsMask, pc)
NOPL
bits[pos/8] |= l
MOVQ R9, CX
SHRQ $0x3, R9
CMPQ R9, SI
- JBE 0x62ad9b
+ JBE 0x62ad70
a := flag << (pos % 8)
MOVQ CX, DX
ANDQ $0x7, CX
@@ -211,29 +207,29 @@ func codeBitmapInternal(code, bits bitvec) bitvec {
MOVB R11, 0(DI)(R9*1)
h := byte(a >> 8)
SHRW $0x8, R10
+ NOPL 0(AX)
if h != 0 {
TESTL R10, R10
- JE 0x62acee
+ JE 0x62acd7
bits[pos/8+1] = h
LEAQ 0x1(R9), R11
CMPQ R11, SI
- JBE 0x62ad90
+ JBE 0x62ad65
MOVB R10, 0x1(R9)(DI*1)
pc += 6
LEAQ 0x6(DX), R9
- LEAQ github.com/ethereum/go-ethereum/core/vm.lookup(SB), R12
+ NOPL 0(AX)(AX*1)
JMP 0x62aa9c
- NOPW
case 7:
CMPL $0x7, DL
- JNE 0x62ad5a
+ JNE 0x62aa9c
bits.setN(set7BitsMask, pc)
NOPL
bits[pos/8] |= l
MOVQ R9, CX
SHRQ $0x3, R9
CMPQ R9, SI
- JBE 0x62ad85
+ JBE 0x62ad5a
a := flag << (pos % 8)
MOVQ CX, DX
ANDQ $0x7, CX
@@ -245,21 +241,17 @@ func codeBitmapInternal(code, bits bitvec) bitvec {
MOVB R11, 0(DI)(R9*1)
h := byte(a >> 8)
SHRW $0x8, R10
+ NOPL 0(AX)
if h != 0 {
TESTL R10, R10
- JE 0x62ad4a
+ JE 0x62ad33
bits[pos/8+1] = h
LEAQ 0x1(R9), R11
- NOPL 0(AX)
CMPQ R11, SI
- JBE 0x62ad79
+ JBE 0x62ad4f
MOVB R10, 0x1(R9)(DI*1)
pc += 7
LEAQ 0x7(DX), R9
- LEAQ github.com/ethereum/go-ethereum/core/vm.lookup(SB), R12
- JMP 0x62aa9c
- LEAQ github.com/ethereum/go-ethereum/core/vm.lookup(SB), R12
- switch numbits {
JMP 0x62aa9c
return bits
MOVQ DI, AX
@@ -271,7 +263,6 @@ func codeBitmapInternal(code, bits bitvec) bitvec {
bits[pos/8+1] = h
MOVQ R11, AX
MOVQ SI, CX
- NOPL
CALL runtime.panicIndexU(SB)
bits[pos/8] |= l
MOVQ R9, AX
@@ -317,7 +308,7 @@ func codeBitmapInternal(code, bits bitvec) bitvec {
MOVQ R9, AX
MOVQ SI, CX
CALL runtime.panicIndexU(SB)
- bits[pos/8] |= lookup[pos%8]
+ bits[pos/8] |= 1 << (pos % 8)
MOVQ R9, AX
MOVQ SI, CX
CALL runtime.panicIndexU(SB)
@@ -330,14 +321,15 @@ func codeBitmapInternal(code, bits bitvec) bitvec {
LEAQ 0x10(R10), R9
for ; numbits >= 16; numbits -= 16 {
CMPL $0x10, DL
- JB 0x62ae76
+ JB 0x62ae56
bits.set16(pc)
NOPL
bits[pos/8] |= a
MOVQ R9, CX
SHRQ $0x3, R9
+ NOPW 0(AX)(AX*1)
CMPQ R9, SI
- JBE 0x62aedd
+ JBE 0x62aebd
a := byte(0xFF << (pos % 8))
MOVQ CX, R10
ANDQ $0x7, CX
@@ -350,14 +342,14 @@ func codeBitmapInternal(code, bits bitvec) bitvec {
bits[pos/8+1] = 0xFF
LEAQ 0x1(R9), R12
CMPQ R12, SI
- JBE 0x62aed2
+ JBE 0x62aeb2
MOVB $0xff, 0x1(R9)(DI*1)
bits[pos/8+2] = ^a
LEAQ 0x2(R9), R12
NOPL 0(AX)
CMPQ R12, SI
- JA 0x62ae09
- JMP 0x62aec7
+ JA 0x62adde
+ JMP 0x62aea7
bits[pos/8+1] = ^a
NOTL R11
MOVB R11, 0x1(R9)(DI*1)
@@ -367,14 +359,14 @@ func codeBitmapInternal(code, bits bitvec) bitvec {
LEAQ 0x8(R10), R9
for ; numbits >= 8; numbits -= 8 {
CMPL $0x8, DL
- JB 0x62aad0
+ JB 0x62aac9
bits.set8(pc)
NOPL
bits[pos/8] |= a
MOVQ R9, CX
SHRQ $0x3, R9
CMPQ R9, SI
- JBE 0x62aebc
+ JBE 0x62ae9c
a := byte(0xFF << (pos % 8))
MOVQ CX, R10
ANDQ $0x7, CX
@@ -387,7 +379,7 @@ func codeBitmapInternal(code, bits bitvec) bitvec {
bits[pos/8+1] = ^a
LEAQ 0x1(R9), R12
CMPQ R12, SI
- JA 0x62ae67
+ JA 0x62ae47
MOVQ R12, AX
MOVQ SI, CX
CALL runtime.panicIndexU(SB)
Why should reversing the bits be faster? (And with that I'm not being snarky and saying it isn't; I'm wondering what the theory behind the scenes is.)
It is a bit more "natural" for some bit-manipulation CPU instructions. E.g. in

	func (bits bitvec) set1_x(pos uint64) {
		bits[pos/8] |= 0x80 >> (pos % 8)
	}

vs

	func (bits bitvec) set1(pos uint64) {
		bits[pos/8] |= 1 << (pos % 8)
	}

the core bit manipulation part

	MOVL $-128, BX
	SHRB CL, BL
	ORL BX, DX

is replaced with a single BTS instruction:

	BTSL DX, CX
One more set of benchmark results, from a Skylake laptop.
On my laptop (
So in summary: the worst cases were improved or unchanged. LGTM
LGTM
* core/vm: reverse bit order in bytes of code bitmap

  This bit order is more natural for bit manipulation operations and we can eliminate some small number of CPU instructions.

* core/vm: drop lookup table
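For context on what this code computes: the bitmap marks which bytes of contract code are PUSH-immediate data rather than opcodes, so jump destinations can be validated. A simplified sketch of that analysis (a bit-by-bit loop, not the masked fast paths the real codeBitmap uses) might look like:

	package main

	import "fmt"

	// codeSegments sets a bit for every byte that is PUSH-immediate data,
	// using the LSB-first order introduced in this PR. The real core/vm
	// code sets whole masks at a time (set2BitsMask etc.) for speed.
	func codeSegments(code []byte) []byte {
		bits := make([]byte, len(code)/8+1)
		for pc := uint64(0); pc < uint64(len(code)); pc++ {
			op := code[pc]
			if op >= 0x60 && op <= 0x7f { // PUSH1..PUSH32
				numbits := uint64(op) - 0x60 + 1
				for i := uint64(1); i <= numbits && pc+i < uint64(len(code)); i++ {
					bits[(pc+i)/8] |= 1 << ((pc + i) % 8)
				}
				pc += numbits
			}
		}
		return bits
	}

	func main() {
		code := []byte{0x60, 0x01, 0x60, 0x02, 0x01} // PUSH1 1, PUSH1 2, ADD
		fmt.Printf("%08b\n", codeSegments(code))     // bits 1 and 3 set: data bytes
	}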