From 4a32b53c31750e102b781ea1cdcd2c1b03418139 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 11 Jan 2022 08:21:50 -0800 Subject: [PATCH] zstd: Minor performance tweaks (#420) These seem better in Go 1.17 --- zstd/blockenc.go | 11 ++-- zstd/enc_base.go | 24 +++++--- zstd/enc_fast.go | 139 +++---------------------------------------- zstd/encoder_test.go | 12 ++-- zstd/fse_encoder.go | 5 +- 5 files changed, 39 insertions(+), 152 deletions(-) diff --git a/zstd/blockenc.go b/zstd/blockenc.go index 3f7fa643df..12e8f6f0b6 100644 --- a/zstd/blockenc.go +++ b/zstd/blockenc.go @@ -51,7 +51,7 @@ func (b *blockEnc) init() { if cap(b.literals) < maxCompressedBlockSize { b.literals = make([]byte, 0, maxCompressedBlockSize) } - const defSeqs = 200 + const defSeqs = 2000 if cap(b.sequences) < defSeqs { b.sequences = make([]seq, 0, defSeqs) } @@ -426,7 +426,7 @@ func fuzzFseEncoder(data []byte) int { return 0 } enc := fseEncoder{} - hist := enc.Histogram()[:256] + hist := enc.Histogram() maxSym := uint8(0) for i, v := range data { v = v & 63 @@ -802,14 +802,13 @@ func (b *blockEnc) genCodes() { // nothing to do return } - if len(b.sequences) > math.MaxUint16 { panic("can only encode up to 64K sequences") } // No bounds checks after here: - llH := b.coders.llEnc.Histogram()[:256] - ofH := b.coders.ofEnc.Histogram()[:256] - mlH := b.coders.mlEnc.Histogram()[:256] + llH := b.coders.llEnc.Histogram() + ofH := b.coders.ofEnc.Histogram() + mlH := b.coders.mlEnc.Histogram() for i := range llH { llH[i] = 0 } diff --git a/zstd/enc_base.go b/zstd/enc_base.go index 295cd602a4..15ae8ee807 100644 --- a/zstd/enc_base.go +++ b/zstd/enc_base.go @@ -108,11 +108,6 @@ func (e *fastBase) UseBlock(enc *blockEnc) { e.blk = enc } -func (e *fastBase) matchlenNoHist(s, t int32, src []byte) int32 { - // Extend the match to be as long as possible. - return int32(matchLen(src[s:], src[t:])) -} - func (e *fastBase) matchlen(s, t int32, src []byte) int32 { if debugAsserts { if s < 0 { @@ -131,9 +126,24 @@ func (e *fastBase) matchlen(s, t int32, src []byte) int32 { panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize)) } } + a := src[s:] + b := src[t:] + b = b[:len(a)] + end := int32((len(a) >> 3) << 3) + for i := int32(0); i < end; i += 8 { + if diff := load6432(a, i) ^ load6432(b, i); diff != 0 { + return i + int32(bits.TrailingZeros64(diff)>>3) + } + } - // Extend the match to be as long as possible. - return int32(matchLen(src[s:], src[t:])) + a = a[end:] + b = b[end:] + for i := range a { + if a[i] != b[i] { + return int32(i) + end + } + } + return int32(len(a)) + end } // Reset the encoding table. diff --git a/zstd/enc_fast.go b/zstd/enc_fast.go index f2502629bc..5f08a28302 100644 --- a/zstd/enc_fast.go +++ b/zstd/enc_fast.go @@ -6,8 +6,6 @@ package zstd import ( "fmt" - "math" - "math/bits" ) const ( @@ -136,20 +134,7 @@ encodeLoop: // Consider history as well. var seq seq var length int32 - // length = 4 + e.matchlen(s+6, repIndex+4, src) - { - a := src[s+6:] - b := src[repIndex+4:] - endI := len(a) & (math.MaxInt32 - 7) - length = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - length = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } - + length = 4 + e.matchlen(s+6, repIndex+4, src) seq.matchLen = uint32(length - zstdMinMatch) // We might be able to match backwards. @@ -236,20 +221,7 @@ encodeLoop: } // Extend the 4-byte match as long as possible. - //l := e.matchlen(s+4, t+4, src) + 4 - var l int32 - { - a := src[s+4:] - b := src[t+4:] - endI := len(a) & (math.MaxInt32 - 7) - l = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + l := e.matchlen(s+4, t+4, src) + 4 // Extend backwards tMin := s - e.maxMatchOff @@ -286,20 +258,7 @@ encodeLoop: if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) { // We have at least 4 byte match. // No need to check backwards. We come straight from a match - //l := 4 + e.matchlen(s+4, o2+4, src) - var l int32 - { - a := src[s+4:] - b := src[o2+4:] - endI := len(a) & (math.MaxInt32 - 7) - l = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + l := 4 + e.matchlen(s+4, o2+4, src) // Store this, since we have it. nextHash := hashLen(cv, hashLog, tableFastHashLen) @@ -418,21 +377,7 @@ encodeLoop: if len(blk.sequences) > 2 && load3232(src, repIndex) == uint32(cv>>16) { // Consider history as well. var seq seq - // length := 4 + e.matchlen(s+6, repIndex+4, src) - // length := 4 + int32(matchLen(src[s+6:], src[repIndex+4:])) - var length int32 - { - a := src[s+6:] - b := src[repIndex+4:] - endI := len(a) & (math.MaxInt32 - 7) - length = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - length = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + length := 4 + e.matchlen(s+6, repIndex+4, src) seq.matchLen = uint32(length - zstdMinMatch) @@ -522,21 +467,7 @@ encodeLoop: panic(fmt.Sprintf("t (%d) < 0 ", t)) } // Extend the 4-byte match as long as possible. - //l := e.matchlenNoHist(s+4, t+4, src) + 4 - // l := int32(matchLen(src[s+4:], src[t+4:])) + 4 - var l int32 - { - a := src[s+4:] - b := src[t+4:] - endI := len(a) & (math.MaxInt32 - 7) - l = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + l := e.matchlen(s+4, t+4, src) + 4 // Extend backwards tMin := s - e.maxMatchOff @@ -573,21 +504,7 @@ encodeLoop: if o2 := s - offset2; len(blk.sequences) > 2 && load3232(src, o2) == uint32(cv) { // We have at least 4 byte match. // No need to check backwards. We come straight from a match - //l := 4 + e.matchlenNoHist(s+4, o2+4, src) - // l := 4 + int32(matchLen(src[s+4:], src[o2+4:])) - var l int32 - { - a := src[s+4:] - b := src[o2+4:] - endI := len(a) & (math.MaxInt32 - 7) - l = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + l := 4 + e.matchlen(s+4, o2+4, src) // Store this, since we have it. nextHash := hashLen(cv, hashLog, tableFastHashLen) @@ -731,19 +648,7 @@ encodeLoop: // Consider history as well. var seq seq var length int32 - // length = 4 + e.matchlen(s+6, repIndex+4, src) - { - a := src[s+6:] - b := src[repIndex+4:] - endI := len(a) & (math.MaxInt32 - 7) - length = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - length = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + length = 4 + e.matchlen(s+6, repIndex+4, src) seq.matchLen = uint32(length - zstdMinMatch) @@ -831,20 +736,7 @@ encodeLoop: } // Extend the 4-byte match as long as possible. - //l := e.matchlen(s+4, t+4, src) + 4 - var l int32 - { - a := src[s+4:] - b := src[t+4:] - endI := len(a) & (math.MaxInt32 - 7) - l = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + l := e.matchlen(s+4, t+4, src) + 4 // Extend backwards tMin := s - e.maxMatchOff @@ -881,20 +773,7 @@ encodeLoop: if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) { // We have at least 4 byte match. // No need to check backwards. We come straight from a match - //l := 4 + e.matchlen(s+4, o2+4, src) - var l int32 - { - a := src[s+4:] - b := src[o2+4:] - endI := len(a) & (math.MaxInt32 - 7) - l = int32(endI) + 4 - for i := 0; i < endI; i += 8 { - if diff := load64(a, i) ^ load64(b, i); diff != 0 { - l = int32(i+bits.TrailingZeros64(diff)>>3) + 4 - break - } - } - } + l := 4 + e.matchlen(s+4, o2+4, src) // Store this, since we have it. nextHash := hashLen(cv, hashLog, tableFastHashLen) diff --git a/zstd/encoder_test.go b/zstd/encoder_test.go index a1c36d16b2..b437225e4f 100644 --- a/zstd/encoder_test.go +++ b/zstd/encoder_test.go @@ -51,7 +51,7 @@ func getEncOpts(cMax int) []testEncOpt { addOpt("nolit", WithNoEntropyCompression(true)) addOpt("pad1k", WithEncoderPadding(1024)) addOpt("zerof", WithZeroFrames(true)) - addOpt("singleseg", WithSingleSegment(true)) + addOpt("1seg", WithSingleSegment(true)) } if testing.Short() && conc == 2 { break @@ -904,10 +904,10 @@ func BenchmarkEncoder_EncodeAllXML(b *testing.B) { } dec.Close() - enc := Encoder{} + enc, _ := NewWriter(nil, WithEncoderConcurrency(1)) dst := enc.EncodeAll(in, nil) wantSize := len(dst) - b.Log("Output size:", len(dst)) + //b.Log("Output size:", len(dst)) b.ResetTimer() b.ReportAllocs() b.SetBytes(int64(len(in))) @@ -994,7 +994,7 @@ func BenchmarkEncoder_EncodeAllHTML(b *testing.B) { b.Fatal(err) } - enc := Encoder{} + enc, _ := NewWriter(nil, WithEncoderConcurrency(1)) dst := enc.EncodeAll(in, nil) wantSize := len(dst) b.ResetTimer() @@ -1018,7 +1018,7 @@ func BenchmarkEncoder_EncodeAllTwain(b *testing.B) { b.Fatal(err) } - enc := Encoder{} + enc, _ := NewWriter(nil, WithEncoderConcurrency(1)) dst := enc.EncodeAll(in, nil) wantSize := len(dst) b.ResetTimer() @@ -1042,7 +1042,7 @@ func BenchmarkEncoder_EncodeAllPi(b *testing.B) { b.Fatal(err) } - enc := Encoder{} + enc, _ := NewWriter(nil, WithEncoderConcurrency(1)) dst := enc.EncodeAll(in, nil) wantSize := len(dst) b.ResetTimer() diff --git a/zstd/fse_encoder.go b/zstd/fse_encoder.go index b4757ee3f0..5442061b18 100644 --- a/zstd/fse_encoder.go +++ b/zstd/fse_encoder.go @@ -62,9 +62,8 @@ func (s symbolTransform) String() string { // To indicate that you have populated the histogram call HistogramFinished // with the value of the highest populated symbol, as well as the number of entries // in the most populated entry. These are accepted at face value. -// The returned slice will always be length 256. -func (s *fseEncoder) Histogram() []uint32 { - return s.count[:] +func (s *fseEncoder) Histogram() *[256]uint32 { + return &s.count } // HistogramFinished can be called to indicate that the histogram has been populated.