From e8bafcbcaea94d7bea229582a630d3d380bdcf44 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 4 Aug 2020 16:02:08 +0200 Subject: [PATCH 1/2] inflate: Limit variable shifts Use and operations to speed up variable shifts. Faster on AMD64: ``` benchmark old ns/op new ns/op delta BenchmarkDecodeDigitsSpeed1e4-32 57027 56892 -0.24% BenchmarkDecodeDigitsSpeed1e5-32 657866 650408 -1.13% BenchmarkDecodeDigitsSpeed1e6-32 6679774 6425893 -3.80% BenchmarkDecodeDigitsDefault1e4-32 62810 61858 -1.52% BenchmarkDecodeDigitsDefault1e5-32 657865 628677 -4.44% BenchmarkDecodeDigitsDefault1e6-32 6486343 6211232 -4.24% BenchmarkDecodeDigitsCompress1e4-32 62169 61555 -0.99% BenchmarkDecodeDigitsCompress1e5-32 677789 668714 -1.34% BenchmarkDecodeDigitsCompress1e6-32 6851431 6685226 -2.43% BenchmarkDecodeTwainSpeed1e4-32 60606 59003 -2.64% BenchmarkDecodeTwainSpeed1e5-32 628151 609357 -2.99% BenchmarkDecodeTwainSpeed1e6-32 6238098 6015035 -3.58% BenchmarkDecodeTwainDefault1e4-32 59901 59167 -1.23% BenchmarkDecodeTwainDefault1e5-32 576772 561311 -2.68% BenchmarkDecodeTwainDefault1e6-32 5701418 5479259 -3.90% BenchmarkDecodeTwainCompress1e4-32 58582 56825 -3.00% BenchmarkDecodeTwainCompress1e5-32 535572 515826 -3.69% BenchmarkDecodeTwainCompress1e6-32 5265486 5090632 -3.32% BenchmarkDecodeRandomSpeed1e4-32 323 319 -1.24% BenchmarkDecodeRandomSpeed1e5-32 1954 1945 -0.46% BenchmarkDecodeRandomSpeed1e6-32 20016 20026 +0.05% benchmark old MB/s new MB/s speedup BenchmarkDecodeDigitsSpeed1e4-32 175.35 175.77 1.00x BenchmarkDecodeDigitsSpeed1e5-32 152.01 153.75 1.01x BenchmarkDecodeDigitsSpeed1e6-32 149.71 155.62 1.04x BenchmarkDecodeDigitsDefault1e4-32 159.21 161.66 1.02x BenchmarkDecodeDigitsDefault1e5-32 152.01 159.06 1.05x BenchmarkDecodeDigitsDefault1e6-32 154.17 161.00 1.04x BenchmarkDecodeDigitsCompress1e4-32 160.85 162.46 1.01x BenchmarkDecodeDigitsCompress1e5-32 147.54 149.54 1.01x BenchmarkDecodeDigitsCompress1e6-32 145.95 149.58 1.02x BenchmarkDecodeTwainSpeed1e4-32 165.00 169.48 
1.03x BenchmarkDecodeTwainSpeed1e5-32 159.20 164.11 1.03x BenchmarkDecodeTwainSpeed1e6-32 160.31 166.25 1.04x BenchmarkDecodeTwainDefault1e4-32 166.94 169.01 1.01x BenchmarkDecodeTwainDefault1e5-32 173.38 178.15 1.03x BenchmarkDecodeTwainDefault1e6-32 175.39 182.51 1.04x BenchmarkDecodeTwainCompress1e4-32 170.70 175.98 1.03x BenchmarkDecodeTwainCompress1e5-32 186.72 193.86 1.04x BenchmarkDecodeTwainCompress1e6-32 189.92 196.44 1.03x BenchmarkDecodeRandomSpeed1e4-32 30915.66 31375.28 1.01x BenchmarkDecodeRandomSpeed1e5-32 51177.19 51408.19 1.00x BenchmarkDecodeRandomSpeed1e6-32 49958.99 49936.11 1.00x ``` --- flate/gen_inflate.go | 28 ++++++----- flate/inflate.go | 26 +++++----- flate/inflate_gen.go | 112 +++++++++++++++++++++++-------------------- 3 files changed, 89 insertions(+), 77 deletions(-) diff --git a/flate/gen_inflate.go b/flate/gen_inflate.go index c74a95fe7f..847d208964 100644 --- a/flate/gen_inflate.go +++ b/flate/gen_inflate.go @@ -167,15 +167,15 @@ readLiteral: return } } - length += int(f.b & uint32(1<<n-1)) - f.b >>= n + length += int(f.b & uint32(1<<(n&31)-1)) + f.b >>= n & 31 f.nb -= n } - var dist int + var dist uint32 if f.hd == nil { for f.nb < 5 { - if err = moreBits(); err != nil { + if err = f.moreBits(); err != nil { if debugDecode { fmt.Println("morebits f.nb<5:", err) } @@ -183,17 +183,19 @@ readLiteral: return } } - dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3))) + dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) f.b >>= 5 f.nb -= 5 } else { - if dist, err = f.huffSym(f.hd); err != nil { + sym, err := f.huffSym(f.hd) + if err != nil { if debugDecode { fmt.Println("huffsym:", err) } f.err = err return } + dist = uint32(sym) } switch { @@ -202,9 +204,9 @@ readLiteral: case dist < maxNumDist: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. 
- extra := (dist & 1) << nb + extra := (dist & 1) << (nb & 31) for f.nb < nb { - if err = moreBits(); err != nil { + if err = f.moreBits(); err != nil { if debugDecode { fmt.Println("morebits f.nb<nb:", err) } @@ -212,10 +214,10 @@ readLiteral: return } } - extra |= int(f.b & uint32(1<<nb-1)) - f.b >>= nb + extra |= f.b & uint32(1<<(nb&31)-1) + f.b >>= nb & 31 f.nb -= nb - dist = 1<<(nb+1) + 1 + extra + dist = 1<<((nb+1)&31) + 1 + extra default: if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) @@ -225,7 +227,7 @@ readLiteral: } // No check on length; encoding can be prescient. - if dist > f.dict.histSize() { + if dist > uint32(f.dict.histSize()) { if debugDecode { fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) } @@ -233,7 +235,7 @@ readLiteral: return } - f.copyLen, f.copyDist = length, dist + f.copyLen, f.copyDist = length, int(dist) goto copyHistory } diff --git a/flate/inflate.go b/flate/inflate.go index 3e4259f157..2c755cbb5d 100644 --- a/flate/inflate.go +++ b/flate/inflate.go @@ -685,12 +685,12 @@ readLiteral: return } } - length += int(f.b & uint32(1<<n-1)) - f.b >>= n + length += int(f.b & uint32(1<<(n&31)-1)) + f.b >>= n & 31 f.nb -= n } - var dist int + var dist uint32 if f.hd == nil { for f.nb < 5 { if err = f.moreBits(); err != nil { @@ -701,17 +701,19 @@ readLiteral: return } } - dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3))) + dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) f.b >>= 5 f.nb -= 5 } else { - if dist, err = f.huffSym(f.hd); err != nil { + sym, err := f.huffSym(f.hd) + if err != nil { if debugDecode { fmt.Println("huffsym:", err) } f.err = err return } + dist = uint32(sym) } switch { @@ -720,7 +722,7 @@ readLiteral: case dist < maxNumDist: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. 
- extra := (dist & 1) << nb + extra := (dist & 1) << (nb & 31) for f.nb < nb { if err = f.moreBits(); err != nil { if debugDecode { @@ -730,10 +732,10 @@ readLiteral: return } } - extra |= int(f.b & uint32(1<<nb-1)) - f.b >>= nb + extra |= f.b & uint32(1<<(nb&31)-1) + f.b >>= nb & 31 f.nb -= nb - dist = 1<<(nb+1) + 1 + extra + dist = 1<<((nb+1)&31) + 1 + extra default: if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) @@ -743,7 +745,7 @@ readLiteral: } // No check on length; encoding can be prescient. - if dist > f.dict.histSize() { + if dist > uint32(f.dict.histSize()) { if debugDecode { fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) } @@ -751,7 +753,7 @@ readLiteral: return } - f.copyLen, f.copyDist = length, dist + f.copyLen, f.copyDist = length, int(dist) goto copyHistory } @@ -869,7 +871,7 @@ func (f *decompressor) moreBits() error { return noEOF(err) } f.roffset++ - f.b |= uint32(c) << f.nb + f.b |= uint32(c) << (f.nb & 31) f.nb += 8 return nil } diff --git a/flate/inflate_gen.go b/flate/inflate_gen.go index 397dc1b1a1..b72718e870 100644 --- a/flate/inflate_gen.go +++ b/flate/inflate_gen.go @@ -145,15 +145,15 @@ readLiteral: return } } - length += int(f.b & uint32(1<<n-1)) - f.b >>= n + length += int(f.b & uint32(1<<(n&31)-1)) + f.b >>= n & 31 f.nb -= n } - var dist int + var dist uint32 if f.hd == nil { for f.nb < 5 { - if err = moreBits(); err != nil { + if err = f.moreBits(); err != nil { if debugDecode { fmt.Println("morebits f.nb<5:", err) } @@ -161,17 +161,19 @@ readLiteral: return } } - dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3))) + dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) f.b >>= 5 f.nb -= 5 } else { - if dist, err = f.huffSym(f.hd); err != nil { + sym, err := f.huffSym(f.hd) + if err != nil { if debugDecode { fmt.Println("huffsym:", err) } f.err = err return } + dist = uint32(sym) } switch { @@ -180,9 +182,9 @@ readLiteral: case dist < maxNumDist: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. 
- extra := (dist & 1) << nb + extra := (dist & 1) << (nb & 31) for f.nb < nb { - if err = moreBits(); err != nil { + if err = f.moreBits(); err != nil { if debugDecode { fmt.Println("morebits f.nb<nb:", err) } @@ -190,10 +192,10 @@ readLiteral: return } } - extra |= int(f.b & uint32(1<<nb-1)) - f.b >>= nb + extra |= f.b & uint32(1<<(nb&31)-1) + f.b >>= nb & 31 f.nb -= nb - dist = 1<<(nb+1) + 1 + extra + dist = 1<<((nb+1)&31) + 1 + extra default: if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) @@ -203,7 +205,7 @@ readLiteral: } // No check on length; encoding can be prescient. - if dist > f.dict.histSize() { + if dist > uint32(f.dict.histSize()) { if debugDecode { fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) } @@ -211,7 +213,7 @@ readLiteral: return } - f.copyLen, f.copyDist = length, dist + f.copyLen, f.copyDist = length, int(dist) goto copyHistory } @@ -369,15 +371,15 @@ readLiteral: return } } - length += int(f.b & uint32(1<<n-1)) - f.b >>= n + length += int(f.b & uint32(1<<(n&31)-1)) + f.b >>= n & 31 f.nb -= n } - var dist int + var dist uint32 if f.hd == nil { for f.nb < 5 { - if err = moreBits(); err != nil { + if err = f.moreBits(); err != nil { if debugDecode { fmt.Println("morebits f.nb<5:", err) } @@ -385,17 +387,19 @@ readLiteral: return } } - dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3))) + dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) f.b >>= 5 f.nb -= 5 } else { - if dist, err = f.huffSym(f.hd); err != nil { + sym, err := f.huffSym(f.hd) + if err != nil { if debugDecode { fmt.Println("huffsym:", err) } f.err = err return } + dist = uint32(sym) } switch { @@ -404,9 +408,9 @@ readLiteral: case dist < maxNumDist: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. 
- extra := (dist & 1) << nb + extra := (dist & 1) << (nb & 31) for f.nb < nb { - if err = moreBits(); err != nil { + if err = f.moreBits(); err != nil { if debugDecode { fmt.Println("morebits f.nb<nb:", err) } @@ -414,10 +418,10 @@ readLiteral: return } } - extra |= int(f.b & uint32(1<<nb-1)) - f.b >>= nb + extra |= f.b & uint32(1<<(nb&31)-1) + f.b >>= nb & 31 f.nb -= nb - dist = 1<<(nb+1) + 1 + extra + dist = 1<<((nb+1)&31) + 1 + extra default: if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) @@ -427,7 +431,7 @@ readLiteral: } // No check on length; encoding can be prescient. - if dist > f.dict.histSize() { + if dist > uint32(f.dict.histSize()) { if debugDecode { fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) } @@ -435,7 +439,7 @@ readLiteral: return } - f.copyLen, f.copyDist = length, dist + f.copyLen, f.copyDist = length, int(dist) goto copyHistory } @@ -593,15 +597,15 @@ readLiteral: return } } - length += int(f.b & uint32(1<<n-1)) - f.b >>= n + length += int(f.b & uint32(1<<(n&31)-1)) + f.b >>= n & 31 f.nb -= n } - var dist int + var dist uint32 if f.hd == nil { for f.nb < 5 { - if err = moreBits(); err != nil { + if err = f.moreBits(); err != nil { if debugDecode { fmt.Println("morebits f.nb<5:", err) } @@ -609,17 +613,19 @@ readLiteral: return } } - dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3))) + dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) f.b >>= 5 f.nb -= 5 } else { - if dist, err = f.huffSym(f.hd); err != nil { + sym, err := f.huffSym(f.hd) + if err != nil { if debugDecode { fmt.Println("huffsym:", err) } f.err = err return } + dist = uint32(sym) } switch { @@ -628,9 +634,9 @@ readLiteral: case dist < maxNumDist: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. 
- extra := (dist & 1) << nb + extra := (dist & 1) << (nb & 31) for f.nb < nb { - if err = moreBits(); err != nil { + if err = f.moreBits(); err != nil { if debugDecode { fmt.Println("morebits f.nb<nb:", err) } @@ -638,10 +644,10 @@ readLiteral: return } } - extra |= int(f.b & uint32(1<<nb-1)) - f.b >>= nb + extra |= f.b & uint32(1<<(nb&31)-1) + f.b >>= nb & 31 f.nb -= nb - dist = 1<<(nb+1) + 1 + extra + dist = 1<<((nb+1)&31) + 1 + extra default: if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) @@ -651,7 +657,7 @@ readLiteral: } // No check on length; encoding can be prescient. - if dist > f.dict.histSize() { + if dist > uint32(f.dict.histSize()) { if debugDecode { fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) } @@ -659,7 +665,7 @@ readLiteral: return } - f.copyLen, f.copyDist = length, dist + f.copyLen, f.copyDist = length, int(dist) goto copyHistory } @@ -817,15 +823,15 @@ readLiteral: return } } - length += int(f.b & uint32(1<<n-1)) - f.b >>= n + length += int(f.b & uint32(1<<(n&31)-1)) + f.b >>= n & 31 f.nb -= n } - var dist int + var dist uint32 if f.hd == nil { for f.nb < 5 { - if err = moreBits(); err != nil { + if err = f.moreBits(); err != nil { if debugDecode { fmt.Println("morebits f.nb<5:", err) } @@ -833,17 +839,19 @@ readLiteral: return } } - dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3))) + dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3))) f.b >>= 5 f.nb -= 5 } else { - if dist, err = f.huffSym(f.hd); err != nil { + sym, err := f.huffSym(f.hd) + if err != nil { if debugDecode { fmt.Println("huffsym:", err) } f.err = err return } + dist = uint32(sym) } switch { @@ -852,9 +860,9 @@ readLiteral: case dist < maxNumDist: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. 
- extra := (dist & 1) << nb + extra := (dist & 1) << (nb & 31) for f.nb < nb { - if err = moreBits(); err != nil { + if err = f.moreBits(); err != nil { if debugDecode { fmt.Println("morebits f.nb<nb:", err) } @@ -862,10 +870,10 @@ readLiteral: return } } - extra |= int(f.b & uint32(1<<nb-1)) - f.b >>= nb + extra |= f.b & uint32(1<<(nb&31)-1) + f.b >>= nb & 31 f.nb -= nb - dist = 1<<(nb+1) + 1 + extra + dist = 1<<((nb+1)&31) + 1 + extra default: if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) @@ -875,7 +883,7 @@ readLiteral: } // No check on length; encoding can be prescient. - if dist > f.dict.histSize() { + if dist > uint32(f.dict.histSize()) { if debugDecode { fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize()) } @@ -883,7 +891,7 @@ readLiteral: return } - f.copyLen, f.copyDist = length, dist + f.copyLen, f.copyDist = length, int(dist) goto copyHistory } From 7823bb634a526fa13a4ebda04262888bae06accb Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 4 Aug 2020 17:25:59 +0200 Subject: [PATCH 2/2] Avoid regressing other platforms. --- flate/fast_encoder.go | 10 +++--- flate/gen_inflate.go | 16 +++++----- flate/huffman_bit_writer.go | 8 ++--- flate/inflate.go | 26 +++++++-------- flate/inflate_gen.go | 64 ++++++++++++++++++------------------- flate/regmask_amd64.go | 37 +++++++++++++++++++++ flate/regmask_other.go | 39 ++++++++++++++++++++++ 7 files changed, 138 insertions(+), 62 deletions(-) create mode 100644 flate/regmask_amd64.go create mode 100644 flate/regmask_other.go diff --git a/flate/fast_encoder.go b/flate/fast_encoder.go index 6d4c1e98bc..4a73e1bdd3 100644 --- a/flate/fast_encoder.go +++ b/flate/fast_encoder.go @@ -127,7 +127,7 @@ func (e *fastGen) addBlock(src []byte) int32 { // hash4 returns the hash of u to fit in a hash table with h bits. // Preferably h should be a constant and should always be <32. 
func hash4u(u uint32, h uint8) uint32 { - return (u * prime4bytes) >> ((32 - h) & 31) + return (u * prime4bytes) >> ((32 - h) & reg8SizeMask32) } type tableEntryPrev struct { @@ -138,25 +138,25 @@ type tableEntryPrev struct { // hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits. // Preferably h should be a constant and should always be <32. func hash4x64(u uint64, h uint8) uint32 { - return (uint32(u) * prime4bytes) >> ((32 - h) & 31) + return (uint32(u) * prime4bytes) >> ((32 - h) & reg8SizeMask32) } // hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits. // Preferably h should be a constant and should always be <64. func hash7(u uint64, h uint8) uint32 { - return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63)) + return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & reg8SizeMask64)) } // hash8 returns the hash of u to fit in a hash table with h bits. // Preferably h should be a constant and should always be <64. func hash8(u uint64, h uint8) uint32 { - return uint32((u * prime8bytes) >> ((64 - h) & 63)) + return uint32((u * prime8bytes) >> ((64 - h) & reg8SizeMask64)) } // hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits. // Preferably h should be a constant and should always be <64. func hash6(u uint64, h uint8) uint32 { - return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63)) + return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & reg8SizeMask64)) } // matchlen will return the match length between offsets and t in src. 
diff --git a/flate/gen_inflate.go b/flate/gen_inflate.go index 847d208964..b26d19ec25 100644 --- a/flate/gen_inflate.go +++ b/flate/gen_inflate.go @@ -85,7 +85,7 @@ readLiteral: return } f.roffset++ - b |= uint32(c) << (nb & 31) + b |= uint32(c) << (nb & regSizeMaskUint32) nb += 8 } chunk := f.hl.chunks[b&(huffmanNumChunks-1)] @@ -104,7 +104,7 @@ readLiteral: f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & 31) + f.b = b >> (n & regSizeMaskUint32) f.nb = nb - n v = int(chunk >> huffmanValueShift) break @@ -167,8 +167,8 @@ readLiteral: return } } - length += int(f.b & uint32(1<<(n&31)-1)) - f.b >>= n & 31 + length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1)) + f.b >>= n & regSizeMaskUint32 f.nb -= n } @@ -204,7 +204,7 @@ readLiteral: case dist < maxNumDist: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. - extra := (dist & 1) << (nb & 31) + extra := (dist & 1) << (nb & regSizeMaskUint32) for f.nb < nb { if err = f.moreBits(); err != nil { if debugDecode { @@ -214,10 +214,10 @@ readLiteral: return } } - extra |= f.b & uint32(1<<(nb&31)-1) - f.b >>= nb & 31 + extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1) + f.b >>= nb & regSizeMaskUint32 f.nb -= nb - dist = 1<<((nb+1)&31) + 1 + extra + dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra default: if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) diff --git a/flate/huffman_bit_writer.go b/flate/huffman_bit_writer.go index 53fe1d06e2..208d66711d 100644 --- a/flate/huffman_bit_writer.go +++ b/flate/huffman_bit_writer.go @@ -206,7 +206,7 @@ func (w *huffmanBitWriter) write(b []byte) { } func (w *huffmanBitWriter) writeBits(b int32, nb uint16) { - w.bits |= uint64(b) << (w.nbits & 63) + w.bits |= uint64(b) << (w.nbits & reg16SizeMask64) w.nbits += nb if w.nbits >= 48 { w.writeOutBits() @@ -759,7 +759,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) } else { // inlined c := lengths[lengthCode&31] - w.bits |= uint64(c.code) << (w.nbits & 63) 
+ w.bits |= uint64(c.code) << (w.nbits & reg16SizeMask64) w.nbits += c.len if w.nbits >= 48 { w.writeOutBits() @@ -779,7 +779,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) } else { // inlined c := offs[offsetCode&31] - w.bits |= uint64(c.code) << (w.nbits & 63) + w.bits |= uint64(c.code) << (w.nbits & reg16SizeMask64) w.nbits += c.len if w.nbits >= 48 { w.writeOutBits() @@ -878,7 +878,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { for _, t := range input { // Bitwriting inlined, ~30% speedup c := encoding[t] - w.bits |= uint64(c.code) << ((w.nbits) & 63) + w.bits |= uint64(c.code) << ((w.nbits) & reg16SizeMask64) w.nbits += c.len if w.nbits >= 48 { bits := w.bits diff --git a/flate/inflate.go b/flate/inflate.go index 2c755cbb5d..189e9fe0b0 100644 --- a/flate/inflate.go +++ b/flate/inflate.go @@ -522,8 +522,8 @@ func (f *decompressor) readHuffman() error { return err } } - rep += int(f.b & uint32(1<<nb-1)) - f.b >>= nb + rep += int(f.b & uint32(1<<(nb&regSizeMaskUint32)-1)) + f.b >>= nb & regSizeMaskUint32 f.nb -= nb if i+rep > n { if debugDecode { @@ -603,7 +603,7 @@ readLiteral: return } f.roffset++ - b |= uint32(c) << (nb & 31) + b |= uint32(c) << (nb & regSizeMaskUint32) nb += 8 } chunk := f.hl.chunks[b&(huffmanNumChunks-1)] @@ -622,7 +622,7 @@ readLiteral: f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & 31) + f.b = b >> (n & regSizeMaskUint32) f.nb = nb - n v = int(chunk >> huffmanValueShift) break @@ -685,8 +685,8 @@ readLiteral: return } } - length += int(f.b & uint32(1<<(n&31)-1)) - f.b >>= n & 31 + length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1)) + f.b >>= n & regSizeMaskUint32 f.nb -= n } @@ -722,7 +722,7 @@ readLiteral: case dist < maxNumDist: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. 
- extra := (dist & 1) << (nb & 31) + extra := (dist & 1) << (nb & regSizeMaskUint32) for f.nb < nb { if err = f.moreBits(); err != nil { if debugDecode { @@ -732,10 +732,10 @@ readLiteral: return } } - extra |= f.b & uint32(1<<(nb&31)-1) - f.b >>= nb & 31 + extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1) + f.b >>= nb & regSizeMaskUint32 f.nb -= nb - dist = 1<<((nb+1)&31) + 1 + extra + dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra default: if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) @@ -871,7 +871,7 @@ func (f *decompressor) moreBits() error { return noEOF(err) } f.roffset++ - f.b |= uint32(c) << (f.nb & 31) + f.b |= uint32(c) << (f.nb & regSizeMaskUint32) f.nb += 8 return nil } @@ -896,7 +896,7 @@ func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) { return 0, noEOF(err) } f.roffset++ - b |= uint32(c) << (nb & 31) + b |= uint32(c) << (nb & regSizeMaskUint32) nb += 8 } chunk := h.chunks[b&(huffmanNumChunks-1)] @@ -915,7 +915,7 @@ func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) { f.err = CorruptInputError(f.roffset) return 0, f.err } - f.b = b >> (n & 31) + f.b = b >> (n & regSizeMaskUint32) f.nb = nb - n return int(chunk >> huffmanValueShift), nil } diff --git a/flate/inflate_gen.go b/flate/inflate_gen.go index b72718e870..9a92a1b302 100644 --- a/flate/inflate_gen.go +++ b/flate/inflate_gen.go @@ -63,7 +63,7 @@ readLiteral: return } f.roffset++ - b |= uint32(c) << (nb & 31) + b |= uint32(c) << (nb & regSizeMaskUint32) nb += 8 } chunk := f.hl.chunks[b&(huffmanNumChunks-1)] @@ -82,7 +82,7 @@ readLiteral: f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & 31) + f.b = b >> (n & regSizeMaskUint32) f.nb = nb - n v = int(chunk >> huffmanValueShift) break @@ -145,8 +145,8 @@ readLiteral: return } } - length += int(f.b & uint32(1<<(n&31)-1)) - f.b >>= n & 31 + length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1)) + f.b >>= n & regSizeMaskUint32 f.nb -= n } @@ -182,7 +182,7 @@ readLiteral: case dist < maxNumDist: nb := 
uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. - extra := (dist & 1) << (nb & 31) + extra := (dist & 1) << (nb & regSizeMaskUint32) for f.nb < nb { if err = f.moreBits(); err != nil { if debugDecode { @@ -192,10 +192,10 @@ readLiteral: return } } - extra |= f.b & uint32(1<<(nb&31)-1) - f.b >>= nb & 31 + extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1) + f.b >>= nb & regSizeMaskUint32 f.nb -= nb - dist = 1<<((nb+1)&31) + 1 + extra + dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra default: if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) @@ -289,7 +289,7 @@ readLiteral: return } f.roffset++ - b |= uint32(c) << (nb & 31) + b |= uint32(c) << (nb & regSizeMaskUint32) nb += 8 } chunk := f.hl.chunks[b&(huffmanNumChunks-1)] @@ -308,7 +308,7 @@ readLiteral: f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & 31) + f.b = b >> (n & regSizeMaskUint32) f.nb = nb - n v = int(chunk >> huffmanValueShift) break @@ -371,8 +371,8 @@ readLiteral: return } } - length += int(f.b & uint32(1<<(n&31)-1)) - f.b >>= n & 31 + length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1)) + f.b >>= n & regSizeMaskUint32 f.nb -= n } @@ -408,7 +408,7 @@ readLiteral: case dist < maxNumDist: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. 
- extra := (dist & 1) << (nb & 31) + extra := (dist & 1) << (nb & regSizeMaskUint32) for f.nb < nb { if err = f.moreBits(); err != nil { if debugDecode { @@ -418,10 +418,10 @@ readLiteral: return } } - extra |= f.b & uint32(1<<(nb&31)-1) - f.b >>= nb & 31 + extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1) + f.b >>= nb & regSizeMaskUint32 f.nb -= nb - dist = 1<<((nb+1)&31) + 1 + extra + dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra default: if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) @@ -515,7 +515,7 @@ readLiteral: return } f.roffset++ - b |= uint32(c) << (nb & 31) + b |= uint32(c) << (nb & regSizeMaskUint32) nb += 8 } chunk := f.hl.chunks[b&(huffmanNumChunks-1)] @@ -534,7 +534,7 @@ readLiteral: f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & 31) + f.b = b >> (n & regSizeMaskUint32) f.nb = nb - n v = int(chunk >> huffmanValueShift) break @@ -597,8 +597,8 @@ readLiteral: return } } - length += int(f.b & uint32(1<<(n&31)-1)) - f.b >>= n & 31 + length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1)) + f.b >>= n & regSizeMaskUint32 f.nb -= n } @@ -634,7 +634,7 @@ readLiteral: case dist < maxNumDist: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. 
- extra := (dist & 1) << (nb & 31) + extra := (dist & 1) << (nb & regSizeMaskUint32) for f.nb < nb { if err = f.moreBits(); err != nil { if debugDecode { @@ -644,10 +644,10 @@ readLiteral: return } } - extra |= f.b & uint32(1<<(nb&31)-1) - f.b >>= nb & 31 + extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1) + f.b >>= nb & regSizeMaskUint32 f.nb -= nb - dist = 1<<((nb+1)&31) + 1 + extra + dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra default: if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) @@ -741,7 +741,7 @@ readLiteral: return } f.roffset++ - b |= uint32(c) << (nb & 31) + b |= uint32(c) << (nb & regSizeMaskUint32) nb += 8 } chunk := f.hl.chunks[b&(huffmanNumChunks-1)] @@ -760,7 +760,7 @@ readLiteral: f.err = CorruptInputError(f.roffset) return } - f.b = b >> (n & 31) + f.b = b >> (n & regSizeMaskUint32) f.nb = nb - n v = int(chunk >> huffmanValueShift) break @@ -823,8 +823,8 @@ readLiteral: return } } - length += int(f.b & uint32(1<<(n&31)-1)) - f.b >>= n & 31 + length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1)) + f.b >>= n & regSizeMaskUint32 f.nb -= n } @@ -860,7 +860,7 @@ readLiteral: case dist < maxNumDist: nb := uint(dist-2) >> 1 // have 1 bit in bottom of dist, need nb more. - extra := (dist & 1) << (nb & 31) + extra := (dist & 1) << (nb & regSizeMaskUint32) for f.nb < nb { if err = f.moreBits(); err != nil { if debugDecode { @@ -870,10 +870,10 @@ readLiteral: return } } - extra |= f.b & uint32(1<<(nb&31)-1) - f.b >>= nb & 31 + extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1) + f.b >>= nb & regSizeMaskUint32 f.nb -= nb - dist = 1<<((nb+1)&31) + 1 + extra + dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra default: if debugDecode { fmt.Println("dist too big:", dist, maxNumDist) diff --git a/flate/regmask_amd64.go b/flate/regmask_amd64.go new file mode 100644 index 0000000000..6ed28061b2 --- /dev/null +++ b/flate/regmask_amd64.go @@ -0,0 +1,37 @@ +package flate + +const ( + // Masks for shifts with register sizes of the shift value. 
+ // This can be used to work around the x86 design of shifting by mod register size. + // It can be used when a variable shift is always smaller than the register size. + + // reg8SizeMaskX - shift value is 8 bits, shifted is X + reg8SizeMask8 = 7 + reg8SizeMask16 = 15 + reg8SizeMask32 = 31 + reg8SizeMask64 = 63 + + // reg16SizeMaskX - shift value is 16 bits, shifted is X + reg16SizeMask8 = reg8SizeMask8 + reg16SizeMask16 = reg8SizeMask16 + reg16SizeMask32 = reg8SizeMask32 + reg16SizeMask64 = reg8SizeMask64 + + // reg32SizeMaskX - shift value is 32 bits, shifted is X + reg32SizeMask8 = reg8SizeMask8 + reg32SizeMask16 = reg8SizeMask16 + reg32SizeMask32 = reg8SizeMask32 + reg32SizeMask64 = reg8SizeMask64 + + // reg64SizeMaskX - shift value is 64 bits, shifted is X + reg64SizeMask8 = reg8SizeMask8 + reg64SizeMask16 = reg8SizeMask16 + reg64SizeMask32 = reg8SizeMask32 + reg64SizeMask64 = reg8SizeMask64 + + // regSizeMaskUintX - shift value is uint, shifted is X + regSizeMaskUint8 = reg8SizeMask8 + regSizeMaskUint16 = reg8SizeMask16 + regSizeMaskUint32 = reg8SizeMask32 + regSizeMaskUint64 = reg8SizeMask64 +) diff --git a/flate/regmask_other.go b/flate/regmask_other.go new file mode 100644 index 0000000000..f477a5d6e5 --- /dev/null +++ b/flate/regmask_other.go @@ -0,0 +1,39 @@ +//+build !amd64 + +package flate + +const ( + // Masks for shifts with register sizes of the shift value. + // This can be used to work around the x86 design of shifting by mod register size. + // It can be used when a variable shift is always smaller than the register size. 
+ + // reg8SizeMaskX - shift value is 8 bits, shifted is X + reg8SizeMask8 = 0xff + reg8SizeMask16 = 0xff + reg8SizeMask32 = 0xff + reg8SizeMask64 = 0xff + + // reg16SizeMaskX - shift value is 16 bits, shifted is X + reg16SizeMask8 = 0xffff + reg16SizeMask16 = 0xffff + reg16SizeMask32 = 0xffff + reg16SizeMask64 = 0xffff + + // reg32SizeMaskX - shift value is 32 bits, shifted is X + reg32SizeMask8 = 0xffffffff + reg32SizeMask16 = 0xffffffff + reg32SizeMask32 = 0xffffffff + reg32SizeMask64 = 0xffffffff + + // reg64SizeMaskX - shift value is 64 bits, shifted is X + reg64SizeMask8 = 0xffffffffffffffff + reg64SizeMask16 = 0xffffffffffffffff + reg64SizeMask32 = 0xffffffffffffffff + reg64SizeMask64 = 0xffffffffffffffff + + // regSizeMaskUintX - shift value is uint, shifted is X + regSizeMaskUint8 = ^uint(0) + regSizeMaskUint16 = ^uint(0) + regSizeMaskUint32 = ^uint(0) + regSizeMaskUint64 = ^uint(0) +)