Skip to content

Commit

Permalink
s2: Improve speed with bigger output margin (#395)
Browse files: browse the repository at this point in the history
Use bigger output margin and utilize it to reduce memmove branching.
  • Loading branch information
klauspost authored Jun 9, 2021
1 parent f118b5f commit 893eb62
Show file tree
Hide file tree
Showing 3 changed files with 261 additions and 889 deletions.
101 changes: 68 additions & 33 deletions s2/_generate/gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,16 @@ func main() {
Constraint(buildtags.Term("gc").ToConstraint())

o := options{
snappy: false,
snappy: false,
outputMargin: 9,
}
o.genEncodeBlockAsm("encodeBlockAsm", 14, 6, 6, limit14B)
o.genEncodeBlockAsm("encodeBlockAsm4MB", 14, 6, 6, 4<<20)
o.genEncodeBlockAsm("encodeBlockAsm12B", 12, 5, 5, limit12B)
o.genEncodeBlockAsm("encodeBlockAsm10B", 10, 5, 4, limit10B)
o.genEncodeBlockAsm("encodeBlockAsm8B", 8, 4, 4, limit8B)

o.outputMargin = 6
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm", 16, 7, 7, limit14B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm4MB", 16, 7, 7, 4<<20)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm12B", 14, 6, 6, limit12B)
Expand All @@ -48,12 +50,14 @@ func main() {

// Snappy compatible
o.snappy = true
o.outputMargin = 9
o.genEncodeBlockAsm("encodeSnappyBlockAsm", 14, 6, 6, limit14B)
o.genEncodeBlockAsm("encodeSnappyBlockAsm12B", 12, 5, 5, limit12B)
o.genEncodeBlockAsm("encodeSnappyBlockAsm10B", 10, 5, 4, limit10B)
o.genEncodeBlockAsm("encodeSnappyBlockAsm8B", 8, 4, 4, limit8B)

o.snappy = false
o.outputMargin = 0
o.maxLen = math.MaxUint32
o.genEmitLiteral()
o.genEmitRepeat()
Expand Down Expand Up @@ -98,9 +102,10 @@ func assert(fn func(ok LabelRef)) {
}

type options struct {
snappy bool
vmbi2 bool
maxLen int
snappy bool
vmbi2 bool
maxLen int
outputMargin int // Should be at least 5.
}

func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, maxLen int) {
Expand Down Expand Up @@ -190,7 +195,7 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m
const inputMargin = 8
tmp, tmp2, tmp3 := GP64(), GP64(), GP64()
MOVQ(lenSrcQ, tmp)
LEAQ(Mem{Base: tmp, Disp: -5}, tmp2)
LEAQ(Mem{Base: tmp, Disp: -o.outputMargin}, tmp2)
// sLimitL := len(src) - inputMargin
LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3)

Expand All @@ -201,12 +206,12 @@ func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, m

MOVL(tmp3.As32(), sLimitL)

// dstLimit := (len(src) - 5 ) - len(src)>>5
// dstLimit := (len(src) - outputMargin ) - len(src)>>5
SHRQ(U8(5), tmp)
SUBL(tmp.As32(), tmp2.As32()) // tmp2 = tmp2 - tmp

assert(func(ok LabelRef) {
// if len(src) > len(src) - len(src)>>5 - 5: ok
// if len(src) > len(src) - len(src)>>5 - outputMargin: ok
CMPQ(lenSrcQ, tmp2)
JGE(ok)
})
Expand Down Expand Up @@ -841,7 +846,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
const inputMargin = 8
tmp, tmp2, tmp3 := GP64(), GP64(), GP64()
MOVQ(lenSrcQ, tmp)
LEAQ(Mem{Base: tmp, Disp: -6}, tmp2)
LEAQ(Mem{Base: tmp, Disp: -o.outputMargin}, tmp2)
// sLimitL := len(src) - inputMargin
LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3)

Expand Down Expand Up @@ -1519,7 +1524,7 @@ func (o options) genEmitLiteral() {
TEXT("emitLiteral", NOSPLIT, "func(dst, lit []byte) int")
Doc("emitLiteral writes a literal chunk and returns the number of bytes written.", "",
"It assumes that:",
" dst is long enough to hold the encoded bytes",
fmt.Sprintf(" dst is long enough to hold the encoded bytes with margin of %d bytes", o.outputMargin),
" 0 <= len(lit) && len(lit) <= math.MaxUint32", "")
Pragma("noescape")

Expand Down Expand Up @@ -1647,8 +1652,11 @@ func (o options) emitLiteral(name string, litLen, retval, dstBase, litBase reg.G
length := GP64()
MOVL(litLen.As32(), length.As32())

// One byte has already been written, so the remaining output margin is one byte smaller.
o.outputMargin--
// updates litBase.
o.genMemMoveShort("emit_lit_memmove_"+name, dstBase, litBase, length, copyEnd)
o.outputMargin++

if updateDst {
Label("memmove_end_copy_" + name)
Expand Down Expand Up @@ -2067,12 +2075,22 @@ func (o options) genMemMoveShort(name string, dst, src, length reg.GPVirtual, en
TESTQ(length, length)
JNZ(ok)
})
Label(name + "tail")
CMPQ(length, U8(3))
JB(LabelRef(name + "move_1or2"))
JE(LabelRef(name + "move_3"))
CMPQ(length, U8(8))
JB(LabelRef(name + "move_4through7"))

if o.outputMargin <= 3 {
CMPQ(length, U8(3))
JB(LabelRef(name + "move_1or2"))
JE(LabelRef(name + "move_3"))
} else if o.outputMargin >= 4 && o.outputMargin < 8 {
CMPQ(length, U8(4))
JLE(LabelRef(name + "move_4"))
}
if o.outputMargin <= 7 {
CMPQ(length, U8(8))
JB(LabelRef(name + "move_4through7"))
} else if o.outputMargin >= 8 {
CMPQ(length, U8(8))
JLE(LabelRef(name + "move_8"))
}
CMPQ(length, U8(16))
JBE(LabelRef(name + "move_8through16"))
CMPQ(length, U8(32))
Expand All @@ -2086,26 +2104,43 @@ func (o options) genMemMoveShort(name string, dst, src, length reg.GPVirtual, en

//genMemMoveLong(name, dst, src, length, end)

Label(name + "move_1or2")
MOVB(Mem{Base: src}, AX.As8())
MOVB(Mem{Base: src, Disp: -1, Index: length, Scale: 1}, CX.As8())
MOVB(AX.As8(), Mem{Base: dst})
MOVB(CX.As8(), Mem{Base: dst, Disp: -1, Index: length, Scale: 1})
JMP(end)
if o.outputMargin <= 3 {
Label(name + "move_1or2")
MOVB(Mem{Base: src}, AX.As8())
MOVB(Mem{Base: src, Disp: -1, Index: length, Scale: 1}, CX.As8())
MOVB(AX.As8(), Mem{Base: dst})
MOVB(CX.As8(), Mem{Base: dst, Disp: -1, Index: length, Scale: 1})
JMP(end)

Label(name + "move_3")
MOVW(Mem{Base: src}, AX.As16())
MOVB(Mem{Base: src, Disp: 2}, CX.As8())
MOVW(AX.As16(), Mem{Base: dst})
MOVB(CX.As8(), Mem{Base: dst, Disp: 2})
JMP(end)
Label(name + "move_3")
MOVW(Mem{Base: src}, AX.As16())
MOVB(Mem{Base: src, Disp: 2}, CX.As8())
MOVW(AX.As16(), Mem{Base: dst})
MOVB(CX.As8(), Mem{Base: dst, Disp: 2})
JMP(end)
}

Label(name + "move_4through7")
MOVL(Mem{Base: src}, AX.As32())
MOVL(Mem{Base: src, Disp: -4, Index: length, Scale: 1}, CX.As32())
MOVL(AX.As32(), Mem{Base: dst})
MOVL(CX.As32(), Mem{Base: dst, Disp: -4, Index: length, Scale: 1})
JMP(end)
if o.outputMargin >= 4 && o.outputMargin < 8 {
// Use single move.
Label(name + "move_4")
MOVL(Mem{Base: src}, AX.As32())
MOVL(AX.As32(), Mem{Base: dst})
JMP(end)
}
if o.outputMargin < 8 {
Label(name + "move_4through7")
MOVL(Mem{Base: src}, AX.As32())
MOVL(Mem{Base: src, Disp: -4, Index: length, Scale: 1}, CX.As32())
MOVL(AX.As32(), Mem{Base: dst})
MOVL(CX.As32(), Mem{Base: dst, Disp: -4, Index: length, Scale: 1})
JMP(end)
} else {
// Use single move.
Label(name + "move_8")
MOVQ(Mem{Base: src}, AX)
MOVQ(AX, Mem{Base: dst})
JMP(end)
}

Label(name + "move_8through16")
MOVQ(Mem{Base: src}, AX)
Expand Down
2 changes: 1 addition & 1 deletion s2/encodeblock_amd64.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 893eb62

Please sign in to comment.