Skip to content

Commit

Permalink
Use buffers for 1X decodes (#500)
Browse files Browse the repository at this point in the history
Also make the buffers safe for concurrent use of a single decoder.

```
benchmark                                            old ns/op     new ns/op     delta
BenchmarkDecompress1XTable/digits-32                 256367        262489        +2.39%
BenchmarkDecompress1XTable/gettysburg-32             5189          5224          +0.67%
BenchmarkDecompress1XTable/twain-32                  825950        825534        -0.05%
BenchmarkDecompress1XTable/low-ent.10k-32            88050         88429         +0.43%
BenchmarkDecompress1XTable/superlow-ent-10k-32       23420         23456         +0.15%
BenchmarkDecompress1XTable/crash2-32                 687           676           -1.59%
BenchmarkDecompress1XTable/endzerobits-32            80.9          78.4          -3.07%
BenchmarkDecompress1XTable/endnonzero-32             494           511           +3.40%
BenchmarkDecompress1XTable/case1-32                  1948          1924          -1.23%
BenchmarkDecompress1XTable/case2-32                  1916          1886          -1.57%
BenchmarkDecompress1XTable/case3-32                  1945          1921          -1.23%
BenchmarkDecompress1XTable/pngdata.001-32            126909        125848        -0.84%
BenchmarkDecompress1XTable/normcount2-32             1285          1295          +0.78%
BenchmarkDecompress1XNoTable/digits-32               255503        261690        +2.42%
BenchmarkDecompress1XNoTable/gettysburg-32           4029          4014          -0.37%
BenchmarkDecompress1XNoTable/twain-32                823710        821251        -0.30%
BenchmarkDecompress1XNoTable/low-ent.10k-32          87024         87182         +0.18%
BenchmarkDecompress1XNoTable/superlow-ent-10k-32     22812         23353         +2.37%
BenchmarkDecompress1XNoTable/crash2-32               82.6          67.2          -18.62%
BenchmarkDecompress1XNoTable/endzerobits-32          54.3          43.8          -19.47%
BenchmarkDecompress1XNoTable/endnonzero-32           59.6          46.6          -21.86%
BenchmarkDecompress1XNoTable/case1-32                179           159           -11.23%
BenchmarkDecompress1XNoTable/case2-32                144           128           -10.71%
BenchmarkDecompress1XNoTable/case3-32                165           145           -12.29%
BenchmarkDecompress1XNoTable/pngdata.001-32          123734        123297        -0.35%
BenchmarkDecompress1XNoTable/normcount2-32           248           241           -3.14%
BenchmarkDecompress4XNoTable/digits-32               152812        151641        -0.77%
BenchmarkDecompress4XNoTable/gettysburg-32           2585          2712          +4.91%
BenchmarkDecompress4XNoTable/twain-32                529935        550282        +3.84%
BenchmarkDecompress4XNoTable/low-ent.10k-32          53602         52664         -1.75%
BenchmarkDecompress4XNoTable/superlow-ent-10k-32     14375         14054         -2.23%
BenchmarkDecompress4XNoTable/case1-32                254           226           -10.84%
BenchmarkDecompress4XNoTable/case2-32                207           182           -12.22%
BenchmarkDecompress4XNoTable/case3-32                215           186           -13.14%
BenchmarkDecompress4XNoTable/pngdata.001-32          73031         76067         +4.16%
BenchmarkDecompress4XNoTable/normcount2-32           309           280           -9.38%
BenchmarkDecompress4XNoTableTableLog8/digits-32      152307        150121        -1.44%
BenchmarkDecompress4XTable/digits-32                 152793        150602        -1.43%
BenchmarkDecompress4XTable/gettysburg-32             3861          3924          +1.63%
BenchmarkDecompress4XTable/twain-32                  536438        550964        +2.71%
BenchmarkDecompress4XTable/low-ent.10k-32            54465         53176         -2.37%
BenchmarkDecompress4XTable/superlow-ent-10k-32       14904         14677         -1.52%
BenchmarkDecompress4XTable/case1-32                  2000          2007          +0.35%
BenchmarkDecompress4XTable/case2-32                  1982          1968          -0.71%
BenchmarkDecompress4XTable/case3-32                  1992          2014          +1.10%
BenchmarkDecompress4XTable/pngdata.001-32            75929         79317         +4.46%
BenchmarkDecompress4XTable/normcount2-32             1372          1344          -2.04%

benchmark                                            old MB/s     new MB/s     speedup
BenchmarkDecompress1XTable/digits-32                 390.08       380.98       0.98x
BenchmarkDecompress1XTable/gettysburg-32             298.30       296.30       0.99x
BenchmarkDecompress1XTable/twain-32                  317.38       317.54       1.00x
BenchmarkDecompress1XTable/low-ent.10k-32            454.29       452.34       1.00x
BenchmarkDecompress1XTable/superlow-ent-10k-32       448.34       447.66       1.00x
BenchmarkDecompress1XTable/crash2-32                 21.84        22.19        1.02x
BenchmarkDecompress1XTable/endzerobits-32            61.81        63.77        1.03x
BenchmarkDecompress1XTable/endnonzero-32             14.17        13.71        0.97x
BenchmarkDecompress1XTable/case1-32                  28.23        28.59        1.01x
BenchmarkDecompress1XTable/case2-32                  23.49        23.86        1.02x
BenchmarkDecompress1XTable/case3-32                  24.67        24.99        1.01x
BenchmarkDecompress1XTable/pngdata.001-32            403.44       406.84       1.01x
BenchmarkDecompress1XTable/normcount2-32             67.71        67.18        0.99x
BenchmarkDecompress1XNoTable/digits-32               391.40       382.14       0.98x
BenchmarkDecompress1XNoTable/gettysburg-32           384.26       385.65       1.00x
BenchmarkDecompress1XNoTable/twain-32                318.25       319.20       1.00x
BenchmarkDecompress1XNoTable/low-ent.10k-32          459.65       458.81       1.00x
BenchmarkDecompress1XNoTable/superlow-ent-10k-32     460.29       449.62       0.98x
BenchmarkDecompress1XNoTable/crash2-32               181.62       223.18       1.23x
BenchmarkDecompress1XNoTable/endzerobits-32          92.04        114.29       1.24x
BenchmarkDecompress1XNoTable/endnonzero-32           117.43       150.28       1.28x
BenchmarkDecompress1XNoTable/case1-32                307.35       346.21       1.13x
BenchmarkDecompress1XNoTable/case2-32                313.02       350.41       1.12x
BenchmarkDecompress1XNoTable/case3-32                290.54       331.28       1.14x
BenchmarkDecompress1XNoTable/pngdata.001-32          413.79       415.26       1.00x
BenchmarkDecompress1XNoTable/normcount2-32           350.06       361.51       1.03x
BenchmarkDecompress4XNoTable/digits-32               654.42       659.47       1.01x
BenchmarkDecompress4XNoTable/gettysburg-32           598.78       570.81       0.95x
BenchmarkDecompress4XNoTable/twain-32                494.67       476.38       0.96x
BenchmarkDecompress4XNoTable/low-ent.10k-32          746.25       759.53       1.02x
BenchmarkDecompress4XNoTable/superlow-ent-10k-32     730.46       747.10       1.02x
BenchmarkDecompress4XNoTable/case1-32                216.72       243.09       1.12x
BenchmarkDecompress4XNoTable/case2-32                217.24       247.59       1.14x
BenchmarkDecompress4XNoTable/case3-32                223.64       257.53       1.15x
BenchmarkDecompress4XNoTable/pngdata.001-32          701.07       673.09       0.96x
BenchmarkDecompress4XNoTable/normcount2-32           281.26       310.40       1.10x
BenchmarkDecompress4XNoTableTableLog8/digits-32      656.59       666.15       1.01x
BenchmarkDecompress4XTable/digits-32                 654.50       664.02       1.01x
BenchmarkDecompress4XTable/gettysburg-32             400.94       394.53       0.98x
BenchmarkDecompress4XTable/twain-32                  488.67       475.79       0.97x
BenchmarkDecompress4XTable/low-ent.10k-32            734.42       752.22       1.02x
BenchmarkDecompress4XTable/superlow-ent-10k-32       704.50       715.40       1.02x
BenchmarkDecompress4XTable/case1-32                  27.51        27.41        1.00x
BenchmarkDecompress4XTable/case2-32                  22.71        22.87        1.01x
BenchmarkDecompress4XTable/case3-32                  24.09        23.84        0.99x
BenchmarkDecompress4XTable/pngdata.001-32            674.31       645.51       0.96x
BenchmarkDecompress4XTable/normcount2-32             63.43        64.73        1.02x
```
  • Loading branch information
klauspost authored Feb 21, 2022
1 parent 8949d94 commit 910cf16
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 7 deletions.
65 changes: 58 additions & 7 deletions huff0/decompress.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"errors"
"fmt"
"io"
"sync"

"github.com/klauspost/compress/fse"
)
Expand Down Expand Up @@ -216,14 +217,23 @@ func (s *Scratch) Decoder() *Decoder {
return &Decoder{
dt: s.dt,
actualTableLog: s.actualTableLog,
bufs: &s.decPool,
}
}

// Decoder provides stateless decoding.
// Scratch.Decoder fills it with the decoding table and table log of the
// Scratch it is created from, and points bufs at that Scratch's decPool.
type Decoder struct {
// dt is the decoding table, copied from Scratch.dt.
dt dTable
// actualTableLog is copied from Scratch.actualTableLog.
actualTableLog uint8
// NOTE(review): this listing is a flattened diff; buf looks like the old
// embedded scratch area that the pooled bufs field replaces (the 4X
// decoders switch from &d.buf to d.buffer()) — confirm before relying on it.
buf [4][256]byte
// bufs hands out *[4][256]byte scratch buffers; see buffer().
bufs *sync.Pool
}

// buffer hands out a scratch area for decoding. A buffer taken from the
// decoder's pool is reused when the pool yields a value of the expected
// type; otherwise a new zeroed one is allocated. Callers return the buffer
// with d.bufs.Put once they are done with it.
func (d *Decoder) buffer() *[4][256]byte {
	if pooled, isBuf := d.bufs.Get().(*[4][256]byte); isBuf {
		return pooled
	}
	return new([4][256]byte)
}

// Decompress1X will decompress a 1X encoded stream.
Expand All @@ -250,7 +260,8 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
dt := d.dt.single[:tlSize]

// Use temp table to avoid bound checks/append penalty.
var buf [256]byte
bufs := d.buffer()
buf := &bufs[0]
var off uint8

for br.off >= 8 {
Expand Down Expand Up @@ -278,13 +289,15 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
if off == 0 {
if len(dst)+256 > maxDecodedSize {
br.close()
d.bufs.Put(bufs)
return nil, ErrMaxDecodedSizeExceeded
}
dst = append(dst, buf[:]...)
}
}

if len(dst)+int(off) > maxDecodedSize {
d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
Expand All @@ -311,6 +324,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
}
}
if len(dst) >= maxDecodedSize {
d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
Expand All @@ -320,6 +334,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
bitsLeft -= nBits
dst = append(dst, uint8(v.entry>>8))
}
d.bufs.Put(bufs)
return dst, br.close()
}

Expand All @@ -342,7 +357,8 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
dt := d.dt.single[:256]

// Use temp table to avoid bound checks/append penalty.
var buf [256]byte
bufs := d.buffer()
buf := &bufs[0]
var off uint8

switch d.actualTableLog {
Expand Down Expand Up @@ -370,6 +386,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
if off == 0 {
if len(dst)+256 > maxDecodedSize {
br.close()
d.bufs.Put(bufs)
return nil, ErrMaxDecodedSizeExceeded
}
dst = append(dst, buf[:]...)
Expand Down Expand Up @@ -399,6 +416,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
if off == 0 {
if len(dst)+256 > maxDecodedSize {
br.close()
d.bufs.Put(bufs)
return nil, ErrMaxDecodedSizeExceeded
}
dst = append(dst, buf[:]...)
Expand Down Expand Up @@ -427,6 +445,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
off += 4
if off == 0 {
if len(dst)+256 > maxDecodedSize {
d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
Expand Down Expand Up @@ -456,6 +475,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
off += 4
if off == 0 {
if len(dst)+256 > maxDecodedSize {
d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
Expand Down Expand Up @@ -485,6 +505,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
off += 4
if off == 0 {
if len(dst)+256 > maxDecodedSize {
d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
Expand Down Expand Up @@ -514,6 +535,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
off += 4
if off == 0 {
if len(dst)+256 > maxDecodedSize {
d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
Expand Down Expand Up @@ -543,6 +565,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
off += 4
if off == 0 {
if len(dst)+256 > maxDecodedSize {
d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
Expand Down Expand Up @@ -572,17 +595,20 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
off += 4
if off == 0 {
if len(dst)+256 > maxDecodedSize {
d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
dst = append(dst, buf[:]...)
}
}
default:
d.bufs.Put(bufs)
return nil, fmt.Errorf("invalid tablelog: %d", d.actualTableLog)
}

if len(dst)+int(off) > maxDecodedSize {
d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
Expand All @@ -602,6 +628,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
}
if len(dst) >= maxDecodedSize {
br.close()
d.bufs.Put(bufs)
return nil, ErrMaxDecodedSizeExceeded
}
v := dt[br.peekByteFast()>>shift]
Expand All @@ -610,6 +637,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
bitsLeft -= int8(nBits)
dst = append(dst, uint8(v.entry>>8))
}
d.bufs.Put(bufs)
return dst, br.close()
}

Expand All @@ -629,7 +657,8 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
dt := d.dt.single[:256]

// Use temp table to avoid bound checks/append penalty.
var buf [256]byte
bufs := d.buffer()
buf := &bufs[0]
var off uint8

const shift = 56
Expand All @@ -656,6 +685,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
off += 4
if off == 0 {
if len(dst)+256 > maxDecodedSize {
d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
Expand All @@ -664,6 +694,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
}

if len(dst)+int(off) > maxDecodedSize {
d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
Expand All @@ -680,6 +711,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
}
}
if len(dst) >= maxDecodedSize {
d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
Expand All @@ -689,6 +721,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
bitsLeft -= int8(nBits)
dst = append(dst, uint8(v.entry>>8))
}
d.bufs.Put(bufs)
return dst, br.close()
}

Expand Down Expand Up @@ -736,7 +769,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
single := d.dt.single[:tlSize]

// Use temp table to avoid bound checks/append penalty.
buf := &d.buf
buf := d.buffer()
var off uint8
var decoded int

Expand Down Expand Up @@ -801,6 +834,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {

if off == 0 {
if bufoff > dstEvery {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
copy(out, buf[0][:])
Expand All @@ -811,13 +845,15 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
decoded += bufoff * 4
// There must at least be 3 buffers left.
if len(out) < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
}
}
if off > 0 {
ioff := int(off)
if len(out) < dstEvery*3+ioff {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 3")
}
copy(out, buf[0][:off])
Expand Down Expand Up @@ -853,6 +889,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
}
// end inline...
if offset >= len(out) {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 4")
}

Expand All @@ -871,6 +908,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
return nil, err
}
}
d.bufs.Put(buf)
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
Expand Down Expand Up @@ -916,7 +954,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
single := d.dt.single[:tlSize]

// Use temp table to avoid bound checks/append penalty.
buf := &d.buf
buf := d.buffer()
var off uint8
var decoded int

Expand Down Expand Up @@ -1022,6 +1060,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {

if off == 0 {
if bufoff > dstEvery {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
copy(out, buf[0][:])
Expand All @@ -1032,13 +1071,15 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
decoded += bufoff * 4
// There must at least be 3 buffers left.
if len(out) < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
}
}
if off > 0 {
ioff := int(off)
if len(out) < dstEvery*3+ioff {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 3")
}
copy(out, buf[0][:off])
Expand All @@ -1056,6 +1097,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
bitsLeft := int(br.off*8) + int(64-br.bitsRead)
for bitsLeft > 0 {
if br.finished() {
d.bufs.Put(buf)
return nil, io.ErrUnexpectedEOF
}
if br.bitsRead >= 56 {
Expand All @@ -1076,6 +1118,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
}
// end inline...
if offset >= len(out) {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 4")
}

Expand All @@ -1090,9 +1133,11 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
decoded += offset - dstEvery*i
err = br.close()
if err != nil {
d.bufs.Put(buf)
return nil, err
}
}
d.bufs.Put(buf)
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
Expand Down Expand Up @@ -1134,7 +1179,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
single := d.dt.single[:tlSize]

// Use temp table to avoid bound checks/append penalty.
buf := &d.buf
buf := d.buffer()
var off uint8
var decoded int

Expand Down Expand Up @@ -1240,6 +1285,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {

if off == 0 {
if bufoff > dstEvery {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
copy(out, buf[0][:])
Expand All @@ -1250,6 +1296,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
decoded += bufoff * 4
// There must at least be 3 buffers left.
if len(out) < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
}
Expand All @@ -1274,6 +1321,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
bitsLeft := int(br.off*8) + int(64-br.bitsRead)
for bitsLeft > 0 {
if br.finished() {
d.bufs.Put(buf)
return nil, io.ErrUnexpectedEOF
}
if br.bitsRead >= 56 {
Expand All @@ -1294,6 +1342,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
}
// end inline...
if offset >= len(out) {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 4")
}

Expand All @@ -1308,9 +1357,11 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
decoded += offset - dstEvery*i
err = br.close()
if err != nil {
d.bufs.Put(buf)
return nil, err
}
}
d.bufs.Put(buf)
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
Expand Down
2 changes: 2 additions & 0 deletions huff0/huff0.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"fmt"
"math"
"math/bits"
"sync"

"github.com/klauspost/compress/fse"
)
Expand Down Expand Up @@ -116,6 +117,7 @@ type Scratch struct {
nodes []nodeElt
tmpOut [4][]byte
fse *fse.Scratch
decPool sync.Pool // *[4][256]byte buffers.
huffWeight [maxSymbolValue + 1]byte
}

Expand Down

0 comments on commit 910cf16

Please sign in to comment.