diff --git a/vendor/github.com/klauspost/crc32/README.md b/vendor/github.com/klauspost/crc32/README.md index 440541c7ff3..029625d3609 100644 --- a/vendor/github.com/klauspost/crc32/README.md +++ b/vendor/github.com/klauspost/crc32/README.md @@ -12,12 +12,15 @@ Install using `go get github.com/klauspost/crc32`. This library is based on Go 1 Replace `import "hash/crc32"` with `import "github.com/klauspost/crc32"` and you are good to go. # changes - +* Oct 20, 2016: Changes have been merged to upstream Go. Package updated to match. * Dec 4, 2015: Uses the "slice-by-8" trick more extensively, which gives a 1.5 to 2.5x speedup if assembler is unavailable. # performance +For *Go 1.7* performance is equivalent to the standard library. So if you use this package for Go 1.7 you can switch back. + + For IEEE tables (the most common), there is approximately a factor 10 speedup with "CLMUL" (Carryless multiplication) instruction: ``` benchmark old ns/op new ns/op delta diff --git a/vendor/github.com/klauspost/crc32/crc32.go b/vendor/github.com/klauspost/crc32/crc32.go index 8d6ba5d3dc5..8aa91b17e90 100644 --- a/vendor/github.com/klauspost/crc32/crc32.go +++ b/vendor/github.com/klauspost/crc32/crc32.go @@ -40,71 +40,96 @@ const ( // Table is a 256-word table representing the polynomial for efficient processing. type Table [256]uint32 +// This file makes use of functions implemented in architecture-specific files. +// The interface that they implement is as follows: +// +// // archAvailableIEEE reports whether an architecture-specific CRC32-IEEE +// // algorithm is available. +// archAvailableIEEE() bool +// +// // archInitIEEE initializes the architecture-specific CRC3-IEEE algorithm. +// // It can only be called if archAvailableIEEE() returns true. +// archInitIEEE() +// +// // archUpdateIEEE updates the given CRC32-IEEE. It can only be called if +// // archInitIEEE() was previously called. +// archUpdateIEEE(crc uint32, p []byte) uint32 +// +// // archAvailableCastagnoli reports whether an architecture-specific +// // CRC32-C algorithm is available. +// archAvailableCastagnoli() bool +// +// // archInitCastagnoli initializes the architecture-specific CRC32-C +// // algorithm. It can only be called if archAvailableCastagnoli() returns +// // true. +// archInitCastagnoli() +// +// // archUpdateCastagnoli updates the given CRC32-C. It can only be called +// // if archInitCastagnoli() was previously called. +// archUpdateCastagnoli(crc uint32, p []byte) uint32 + // castagnoliTable points to a lazily initialized Table for the Castagnoli // polynomial. MakeTable will always return this value when asked to make a // Castagnoli table so we can compare against it to find when the caller is // using this polynomial. var castagnoliTable *Table var castagnoliTable8 *slicing8Table +var castagnoliArchImpl bool +var updateCastagnoli func(crc uint32, p []byte) uint32 var castagnoliOnce sync.Once func castagnoliInit() { - castagnoliTable = makeTable(Castagnoli) - castagnoliTable8 = makeTable8(Castagnoli) + castagnoliTable = simpleMakeTable(Castagnoli) + castagnoliArchImpl = archAvailableCastagnoli() + + if castagnoliArchImpl { + archInitCastagnoli() + updateCastagnoli = archUpdateCastagnoli + } else { + // Initialize the slicing-by-8 table. + castagnoliTable8 = slicingMakeTable(Castagnoli) + updateCastagnoli = func(crc uint32, p []byte) uint32 { + return slicingUpdate(crc, castagnoliTable8, p) + } + } } // IEEETable is the table for the IEEE polynomial. -var IEEETable = makeTable(IEEE) - -// slicing8Table is array of 8 Tables -type slicing8Table [8]Table +var IEEETable = simpleMakeTable(IEEE) // ieeeTable8 is the slicing8Table for IEEE var ieeeTable8 *slicing8Table -var ieeeTable8Once sync.Once +var ieeeArchImpl bool +var updateIEEE func(crc uint32, p []byte) uint32 +var ieeeOnce sync.Once + +func ieeeInit() { + ieeeArchImpl = archAvailableIEEE() + + if ieeeArchImpl { + archInitIEEE() + updateIEEE = archUpdateIEEE + } else { + // Initialize the slicing-by-8 table. + ieeeTable8 = slicingMakeTable(IEEE) + updateIEEE = func(crc uint32, p []byte) uint32 { + return slicingUpdate(crc, ieeeTable8, p) + } + } +} // MakeTable returns a Table constructed from the specified polynomial. // The contents of this Table must not be modified. func MakeTable(poly uint32) *Table { switch poly { case IEEE: + ieeeOnce.Do(ieeeInit) return IEEETable case Castagnoli: castagnoliOnce.Do(castagnoliInit) return castagnoliTable } - return makeTable(poly) -} - -// makeTable returns the Table constructed from the specified polynomial. -func makeTable(poly uint32) *Table { - t := new(Table) - for i := 0; i < 256; i++ { - crc := uint32(i) - for j := 0; j < 8; j++ { - if crc&1 == 1 { - crc = (crc >> 1) ^ poly - } else { - crc >>= 1 - } - } - t[i] = crc - } - return t -} - -// makeTable8 returns slicing8Table constructed from the specified polynomial. -func makeTable8(poly uint32) *slicing8Table { - t := new(slicing8Table) - t[0] = *makeTable(poly) - for i := 0; i < 256; i++ { - crc := t[0][i] - for j := 1; j < 8; j++ { - crc = t[0][crc&0xFF] ^ (crc >> 8) - t[j][i] = crc - } - } - return t + return simpleMakeTable(poly) } // digest represents the partial evaluation of a checksum. @@ -116,7 +141,12 @@ type digest struct { // New creates a new hash.Hash32 computing the CRC-32 checksum // using the polynomial represented by the Table. // Its Sum method will lay the value out in big-endian byte order. -func New(tab *Table) hash.Hash32 { return &digest{0, tab} } +func New(tab *Table) hash.Hash32 { + if tab == IEEETable { + ieeeOnce.Do(ieeeInit) + } + return &digest{0, tab} +} // NewIEEE creates a new hash.Hash32 computing the CRC-32 checksum // using the IEEE polynomial. @@ -129,44 +159,32 @@ func (d *digest) BlockSize() int { return 1 } func (d *digest) Reset() { d.crc = 0 } -func update(crc uint32, tab *Table, p []byte) uint32 { - crc = ^crc - for _, v := range p { - crc = tab[byte(crc)^v] ^ (crc >> 8) - } - return ^crc -} - -// updateSlicingBy8 updates CRC using Slicing-by-8 -func updateSlicingBy8(crc uint32, tab *slicing8Table, p []byte) uint32 { - crc = ^crc - for len(p) > 8 { - crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24 - crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^ - tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^ - tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF] - p = p[8:] - } - crc = ^crc - if len(p) == 0 { - return crc - } - return update(crc, &tab[0], p) -} - // Update returns the result of adding the bytes in p to the crc. func Update(crc uint32, tab *Table, p []byte) uint32 { - if tab == castagnoliTable { + switch tab { + case castagnoliTable: return updateCastagnoli(crc, p) - } - if tab == IEEETable { + case IEEETable: + // Unfortunately, because IEEETable is exported, IEEE may be used without a + // call to MakeTable. We have to make sure it gets initialized in that case. + ieeeOnce.Do(ieeeInit) return updateIEEE(crc, p) + default: + return simpleUpdate(crc, tab, p) } - return update(crc, tab, p) } func (d *digest) Write(p []byte) (n int, err error) { - d.crc = Update(d.crc, d.tab, p) + switch d.tab { + case castagnoliTable: + d.crc = updateCastagnoli(d.crc, p) + case IEEETable: + // We only create digest objects through New() which takes care of + // initialization in this case. + d.crc = updateIEEE(d.crc, p) + default: + d.crc = simpleUpdate(d.crc, d.tab, p) + } return len(p), nil } @@ -183,4 +201,7 @@ func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) } // ChecksumIEEE returns the CRC-32 checksum of data // using the IEEE polynomial. -func ChecksumIEEE(data []byte) uint32 { return updateIEEE(0, data) } +func ChecksumIEEE(data []byte) uint32 { + ieeeOnce.Do(ieeeInit) + return updateIEEE(0, data) +} diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64.go b/vendor/github.com/klauspost/crc32/crc32_amd64.go index 4827128ea04..af2a0b844bd 100644 --- a/vendor/github.com/klauspost/crc32/crc32_amd64.go +++ b/vendor/github.com/klauspost/crc32/crc32_amd64.go @@ -4,8 +4,14 @@ // +build !appengine,!gccgo +// AMD64-specific hardware-assisted CRC32 algorithms. See crc32.go for a +// description of the interface that each architecture-specific file +// implements. + package crc32 +import "unsafe" + // This file contains the code to call the SSE 4.2 version of the Castagnoli // and IEEE CRC. @@ -15,11 +21,20 @@ func haveSSE41() bool func haveSSE42() bool func haveCLMUL() bool -// castagnoliSSE42 is defined in crc_amd64.s and uses the SSE4.2 CRC32 +// castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32 // instruction. //go:noescape func castagnoliSSE42(crc uint32, p []byte) uint32 +// castagnoliSSE42Triple is defined in crc32_amd64.s and uses the SSE4.2 CRC32 +// instruction. +//go:noescape +func castagnoliSSE42Triple( + crcA, crcB, crcC uint32, + a, b, c []byte, + rounds uint32, +) (retA uint32, retB uint32, retC uint32) + // ieeeCLMUL is defined in crc_amd64.s and uses the PCLMULQDQ // instruction as well as SSE 4.1. //go:noescape @@ -28,35 +43,188 @@ func ieeeCLMUL(crc uint32, p []byte) uint32 var sse42 = haveSSE42() var useFastIEEE = haveCLMUL() && haveSSE41() -func updateCastagnoli(crc uint32, p []byte) uint32 { - if sse42 { - return castagnoliSSE42(crc, p) +const castagnoliK1 = 168 +const castagnoliK2 = 1344 + +type sse42Table [4]Table + +var castagnoliSSE42TableK1 *sse42Table +var castagnoliSSE42TableK2 *sse42Table + +func archAvailableCastagnoli() bool { + return sse42 +} + +func archInitCastagnoli() { + if !sse42 { + panic("arch-specific Castagnoli not available") } - // only use slicing-by-8 when input is >= 16 Bytes - if len(p) >= 16 { - return updateSlicingBy8(crc, castagnoliTable8, p) + castagnoliSSE42TableK1 = new(sse42Table) + castagnoliSSE42TableK2 = new(sse42Table) + // See description in updateCastagnoli. + // t[0][i] = CRC(i000, O) + // t[1][i] = CRC(0i00, O) + // t[2][i] = CRC(00i0, O) + // t[3][i] = CRC(000i, O) + // where O is a sequence of K zeros. + var tmp [castagnoliK2]byte + for b := 0; b < 4; b++ { + for i := 0; i < 256; i++ { + val := uint32(i) << uint32(b*8) + castagnoliSSE42TableK1[b][i] = castagnoliSSE42(val, tmp[:castagnoliK1]) + castagnoliSSE42TableK2[b][i] = castagnoliSSE42(val, tmp[:]) + } } - return update(crc, castagnoliTable, p) } -func updateIEEE(crc uint32, p []byte) uint32 { - if useFastIEEE && len(p) >= 64 { - left := len(p) & 15 - do := len(p) - left - crc = ^ieeeCLMUL(^crc, p[:do]) - if left > 0 { - crc = update(crc, IEEETable, p[do:]) +// castagnoliShift computes the CRC32-C of K1 or K2 zeroes (depending on the +// table given) with the given initial crc value. This corresponds to +// CRC(crc, O) in the description in updateCastagnoli. +func castagnoliShift(table *sse42Table, crc uint32) uint32 { + return table[3][crc>>24] ^ + table[2][(crc>>16)&0xFF] ^ + table[1][(crc>>8)&0xFF] ^ + table[0][crc&0xFF] +} + +func archUpdateCastagnoli(crc uint32, p []byte) uint32 { + if !sse42 { + panic("not available") + } + + // This method is inspired from the algorithm in Intel's white paper: + // "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction" + // The same strategy of splitting the buffer in three is used but the + // combining calculation is different; the complete derivation is explained + // below. + // + // -- The basic idea -- + // + // The CRC32 instruction (available in SSE4.2) can process 8 bytes at a + // time. In recent Intel architectures the instruction takes 3 cycles; + // however the processor can pipeline up to three instructions if they + // don't depend on each other. + // + // Roughly this means that we can process three buffers in about the same + // time we can process one buffer. + // + // The idea is then to split the buffer in three, CRC the three pieces + // separately and then combine the results. + // + // Combining the results requires precomputed tables, so we must choose a + // fixed buffer length to optimize. The longer the length, the faster; but + // only buffers longer than this length will use the optimization. We choose + // two cutoffs and compute tables for both: + // - one around 512: 168*3=504 + // - one around 4KB: 1344*3=4032 + // + // -- The nitty gritty -- + // + // Let CRC(I, X) be the non-inverted CRC32-C of the sequence X (with + // initial non-inverted CRC I). This function has the following properties: + // (a) CRC(I, AB) = CRC(CRC(I, A), B) + // (b) CRC(I, A xor B) = CRC(I, A) xor CRC(0, B) + // + // Say we want to compute CRC(I, ABC) where A, B, C are three sequences of + // K bytes each, where K is a fixed constant. Let O be the sequence of K zero + // bytes. + // + // CRC(I, ABC) = CRC(I, ABO xor C) + // = CRC(I, ABO) xor CRC(0, C) + // = CRC(CRC(I, AB), O) xor CRC(0, C) + // = CRC(CRC(I, AO xor B), O) xor CRC(0, C) + // = CRC(CRC(I, AO) xor CRC(0, B), O) xor CRC(0, C) + // = CRC(CRC(CRC(I, A), O) xor CRC(0, B), O) xor CRC(0, C) + // + // The castagnoliSSE42Triple function can compute CRC(I, A), CRC(0, B), + // and CRC(0, C) efficiently. We just need to find a way to quickly compute + // CRC(uvwx, O) given a 4-byte initial value uvwx. We can precompute these + // values; since we can't have a 32-bit table, we break it up into four + // 8-bit tables: + // + // CRC(uvwx, O) = CRC(u000, O) xor + // CRC(0v00, O) xor + // CRC(00w0, O) xor + // CRC(000x, O) + // + // We can compute tables corresponding to the four terms for all 8-bit + // values. + + crc = ^crc + + // If a buffer is long enough to use the optimization, process the first few + // bytes to align the buffer to an 8 byte boundary (if necessary). + if len(p) >= castagnoliK1*3 { + delta := int(uintptr(unsafe.Pointer(&p[0])) & 7) + if delta != 0 { + delta = 8 - delta + crc = castagnoliSSE42(crc, p[:delta]) + p = p[delta:] } - return crc } - // only use slicing-by-8 when input is >= 16 Bytes - if len(p) >= 16 { - ieeeTable8Once.Do(func() { - ieeeTable8 = makeTable8(IEEE) - }) - return updateSlicingBy8(crc, ieeeTable8, p) + // Process 3*K2 at a time. + for len(p) >= castagnoliK2*3 { + // Compute CRC(I, A), CRC(0, B), and CRC(0, C). + crcA, crcB, crcC := castagnoliSSE42Triple( + crc, 0, 0, + p, p[castagnoliK2:], p[castagnoliK2*2:], + castagnoliK2/24) + + // CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B) + crcAB := castagnoliShift(castagnoliSSE42TableK2, crcA) ^ crcB + // CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C) + crc = castagnoliShift(castagnoliSSE42TableK2, crcAB) ^ crcC + p = p[castagnoliK2*3:] + } + + // Process 3*K1 at a time. + for len(p) >= castagnoliK1*3 { + // Compute CRC(I, A), CRC(0, B), and CRC(0, C). + crcA, crcB, crcC := castagnoliSSE42Triple( + crc, 0, 0, + p, p[castagnoliK1:], p[castagnoliK1*2:], + castagnoliK1/24) + + // CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B) + crcAB := castagnoliShift(castagnoliSSE42TableK1, crcA) ^ crcB + // CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C) + crc = castagnoliShift(castagnoliSSE42TableK1, crcAB) ^ crcC + p = p[castagnoliK1*3:] + } + + // Use the simple implementation for what's left. + crc = castagnoliSSE42(crc, p) + return ^crc +} + +func archAvailableIEEE() bool { + return useFastIEEE +} + +var archIeeeTable8 *slicing8Table + +func archInitIEEE() { + if !useFastIEEE { + panic("not available") } + // We still use slicing-by-8 for small buffers. + archIeeeTable8 = slicingMakeTable(IEEE) +} - return update(crc, IEEETable, p) +func archUpdateIEEE(crc uint32, p []byte) uint32 { + if !useFastIEEE { + panic("not available") + } + + if len(p) >= 64 { + left := len(p) & 15 + do := len(p) - left + crc = ^ieeeCLMUL(^crc, p[:do]) + p = p[do:] + } + if len(p) == 0 { + return crc + } + return slicingUpdate(crc, archIeeeTable8, p) } diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64.s b/vendor/github.com/klauspost/crc32/crc32_amd64.s index 9bf05d89b8d..e8a7941ce75 100644 --- a/vendor/github.com/klauspost/crc32/crc32_amd64.s +++ b/vendor/github.com/klauspost/crc32/crc32_amd64.s @@ -7,54 +7,136 @@ #define NOSPLIT 4 #define RODATA 8 +// castagnoliSSE42 updates the (non-inverted) crc with the given buffer. +// // func castagnoliSSE42(crc uint32, p []byte) uint32 TEXT ·castagnoliSSE42(SB), NOSPLIT, $0 MOVL crc+0(FP), AX // CRC value MOVQ p+8(FP), SI // data pointer MOVQ p_len+16(FP), CX // len(p) - NOTL AX - - // If there's less than 8 bytes to process, we do it byte-by-byte. + // If there are fewer than 8 bytes to process, skip alignment. CMPQ CX, $8 - JL cleanup + JL less_than_8 - // Process individual bytes until the input is 8-byte aligned. -startup: MOVQ SI, BX ANDQ $7, BX JZ aligned + // Process the first few bytes to 8-byte align the input. + + // BX = 8 - BX. We need to process this many bytes to align. + SUBQ $1, BX + XORQ $7, BX + + BTQ $0, BX + JNC align_2 + CRC32B (SI), AX DECQ CX INCQ SI - JMP startup + +align_2: + BTQ $1, BX + JNC align_4 + + // CRC32W (SI), AX + BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 + + SUBQ $2, CX + ADDQ $2, SI + +align_4: + BTQ $2, BX + JNC aligned + + // CRC32L (SI), AX + BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 + + SUBQ $4, CX + ADDQ $4, SI aligned: // The input is now 8-byte aligned and we can process 8-byte chunks. CMPQ CX, $8 - JL cleanup + JL less_than_8 CRC32Q (SI), AX ADDQ $8, SI SUBQ $8, CX JMP aligned -cleanup: - // We may have some bytes left over that we process one at a time. - CMPQ CX, $0 - JE done +less_than_8: + // We may have some bytes left over; process 4 bytes, then 2, then 1. + BTQ $2, CX + JNC less_than_4 + + // CRC32L (SI), AX + BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 + ADDQ $4, SI + +less_than_4: + BTQ $1, CX + JNC less_than_2 + + // CRC32W (SI), AX + BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 + ADDQ $2, SI + +less_than_2: + BTQ $0, CX + JNC done CRC32B (SI), AX - INCQ SI - DECQ CX - JMP cleanup done: - NOTL AX MOVL AX, ret+32(FP) RET +// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds) +// bytes from each buffer. +// +// func castagnoliSSE42Triple( +// crc1, crc2, crc3 uint32, +// a, b, c []byte, +// rounds uint32, +// ) (retA uint32, retB uint32, retC uint32) +TEXT ·castagnoliSSE42Triple(SB), NOSPLIT, $0 + MOVL crcA+0(FP), AX + MOVL crcB+4(FP), CX + MOVL crcC+8(FP), DX + + MOVQ a+16(FP), R8 // data pointer + MOVQ b+40(FP), R9 // data pointer + MOVQ c+64(FP), R10 // data pointer + + MOVL rounds+88(FP), R11 + +loop: + CRC32Q (R8), AX + CRC32Q (R9), CX + CRC32Q (R10), DX + + CRC32Q 8(R8), AX + CRC32Q 8(R9), CX + CRC32Q 8(R10), DX + + CRC32Q 16(R8), AX + CRC32Q 16(R9), CX + CRC32Q 16(R10), DX + + ADDQ $24, R8 + ADDQ $24, R9 + ADDQ $24, R10 + + DECQ R11 + JNZ loop + + MOVL AX, retA+96(FP) + MOVL CX, retB+100(FP) + MOVL DX, retC+104(FP) + RET + // func haveSSE42() bool TEXT ·haveSSE42(SB), NOSPLIT, $0 XORQ AX, AX @@ -185,7 +267,7 @@ remain64: PXOR X5, X1 PXOR X4, X1 - // More than 16 bytes left? + // If there is less than 16 bytes left we are done CMPQ CX, $16 JB finish diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64p32.go b/vendor/github.com/klauspost/crc32/crc32_amd64p32.go index 926473e7c0e..3222b06a5a0 100644 --- a/vendor/github.com/klauspost/crc32/crc32_amd64p32.go +++ b/vendor/github.com/klauspost/crc32/crc32_amd64p32.go @@ -9,32 +9,35 @@ package crc32 // This file contains the code to call the SSE 4.2 version of the Castagnoli // CRC. -// haveSSE42 is defined in crc_amd64p32.s and uses CPUID to test for SSE 4.2 +// haveSSE42 is defined in crc32_amd64p32.s and uses CPUID to test for SSE 4.2 // support. func haveSSE42() bool -// castagnoliSSE42 is defined in crc_amd64.s and uses the SSE4.2 CRC32 +// castagnoliSSE42 is defined in crc32_amd64p32.s and uses the SSE4.2 CRC32 // instruction. //go:noescape func castagnoliSSE42(crc uint32, p []byte) uint32 var sse42 = haveSSE42() -func updateCastagnoli(crc uint32, p []byte) uint32 { - if sse42 { - return castagnoliSSE42(crc, p) - } - return update(crc, castagnoliTable, p) +func archAvailableCastagnoli() bool { + return sse42 } -func updateIEEE(crc uint32, p []byte) uint32 { - // only use slicing-by-8 when input is >= 4KB - if len(p) >= 4096 { - ieeeTable8Once.Do(func() { - ieeeTable8 = makeTable8(IEEE) - }) - return updateSlicingBy8(crc, ieeeTable8, p) +func archInitCastagnoli() { + if !sse42 { + panic("not available") } + // No initialization necessary. +} - return update(crc, IEEETable, p) +func archUpdateCastagnoli(crc uint32, p []byte) uint32 { + if !sse42 { + panic("not available") + } + return castagnoliSSE42(crc, p) } + +func archAvailableIEEE() bool { return false } +func archInitIEEE() { panic("not available") } +func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") } diff --git a/vendor/github.com/klauspost/crc32/crc32_generic.go b/vendor/github.com/klauspost/crc32/crc32_generic.go index a53cf96a00b..abacbb663d4 100644 --- a/vendor/github.com/klauspost/crc32/crc32_generic.go +++ b/vendor/github.com/klauspost/crc32/crc32_generic.go @@ -2,28 +2,88 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64,!amd64p32 appengine gccgo +// This file contains CRC32 algorithms that are not specific to any architecture +// and don't use hardware acceleration. +// +// The simple (and slow) CRC32 implementation only uses a 256*4 bytes table. +// +// The slicing-by-8 algorithm is a faster implementation that uses a bigger +// table (8*256*4 bytes). package crc32 -// This file contains the generic version of updateCastagnoli which does -// slicing-by-8, or uses the fallback for very small sizes. +// simpleMakeTable allocates and constructs a Table for the specified +// polynomial. The table is suitable for use with the simple algorithm +// (simpleUpdate). +func simpleMakeTable(poly uint32) *Table { + t := new(Table) + simplePopulateTable(poly, t) + return t +} + +// simplePopulateTable constructs a Table for the specified polynomial, suitable +// for use with simpleUpdate. +func simplePopulateTable(poly uint32, t *Table) { + for i := 0; i < 256; i++ { + crc := uint32(i) + for j := 0; j < 8; j++ { + if crc&1 == 1 { + crc = (crc >> 1) ^ poly + } else { + crc >>= 1 + } + } + t[i] = crc + } +} + +// simpleUpdate uses the simple algorithm to update the CRC, given a table that +// was previously computed using simpleMakeTable. +func simpleUpdate(crc uint32, tab *Table, p []byte) uint32 { + crc = ^crc + for _, v := range p { + crc = tab[byte(crc)^v] ^ (crc >> 8) + } + return ^crc +} + +// Use slicing-by-8 when payload >= this value. +const slicing8Cutoff = 16 -func updateCastagnoli(crc uint32, p []byte) uint32 { - // only use slicing-by-8 when input is >= 16 Bytes - if len(p) >= 16 { - return updateSlicingBy8(crc, castagnoliTable8, p) +// slicing8Table is array of 8 Tables, used by the slicing-by-8 algorithm. +type slicing8Table [8]Table + +// slicingMakeTable constructs a slicing8Table for the specified polynomial. The +// table is suitable for use with the slicing-by-8 algorithm (slicingUpdate). +func slicingMakeTable(poly uint32) *slicing8Table { + t := new(slicing8Table) + simplePopulateTable(poly, &t[0]) + for i := 0; i < 256; i++ { + crc := t[0][i] + for j := 1; j < 8; j++ { + crc = t[0][crc&0xFF] ^ (crc >> 8) + t[j][i] = crc + } } - return update(crc, castagnoliTable, p) + return t } -func updateIEEE(crc uint32, p []byte) uint32 { - // only use slicing-by-8 when input is >= 16 Bytes - if len(p) >= 16 { - ieeeTable8Once.Do(func() { - ieeeTable8 = makeTable8(IEEE) - }) - return updateSlicingBy8(crc, ieeeTable8, p) +// slicingUpdate uses the slicing-by-8 algorithm to update the CRC, given a +// table that was previously computed using slicingMakeTable. +func slicingUpdate(crc uint32, tab *slicing8Table, p []byte) uint32 { + if len(p) >= slicing8Cutoff { + crc = ^crc + for len(p) > 8 { + crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24 + crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^ + tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^ + tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF] + p = p[8:] + } + crc = ^crc + } + if len(p) == 0 { + return crc } - return update(crc, IEEETable, p) + return simpleUpdate(crc, &tab[0], p) } diff --git a/vendor/github.com/klauspost/crc32/crc32_otherarch.go b/vendor/github.com/klauspost/crc32/crc32_otherarch.go new file mode 100644 index 00000000000..cc960764bce --- /dev/null +++ b/vendor/github.com/klauspost/crc32/crc32_otherarch.go @@ -0,0 +1,15 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !amd64,!amd64p32,!s390x + +package crc32 + +func archAvailableIEEE() bool { return false } +func archInitIEEE() { panic("not available") } +func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") } + +func archAvailableCastagnoli() bool { return false } +func archInitCastagnoli() { panic("not available") } +func archUpdateCastagnoli(crc uint32, p []byte) uint32 { panic("not available") } diff --git a/vendor/github.com/klauspost/crc32/crc32_s390x.go b/vendor/github.com/klauspost/crc32/crc32_s390x.go new file mode 100644 index 00000000000..ce96f032819 --- /dev/null +++ b/vendor/github.com/klauspost/crc32/crc32_s390x.go @@ -0,0 +1,91 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build s390x + +package crc32 + +const ( + vxMinLen = 64 + vxAlignMask = 15 // align to 16 bytes +) + +// hasVectorFacility reports whether the machine has the z/Architecture +// vector facility installed and enabled. +func hasVectorFacility() bool + +var hasVX = hasVectorFacility() + +// vectorizedCastagnoli implements CRC32 using vector instructions. +// It is defined in crc32_s390x.s. +//go:noescape +func vectorizedCastagnoli(crc uint32, p []byte) uint32 + +// vectorizedIEEE implements CRC32 using vector instructions. +// It is defined in crc32_s390x.s. +//go:noescape +func vectorizedIEEE(crc uint32, p []byte) uint32 + +func archAvailableCastagnoli() bool { + return hasVX +} + +var archCastagnoliTable8 *slicing8Table + +func archInitCastagnoli() { + if !hasVX { + panic("not available") + } + // We still use slicing-by-8 for small buffers. + archCastagnoliTable8 = slicingMakeTable(Castagnoli) +} + +// archUpdateCastagnoli calculates the checksum of p using +// vectorizedCastagnoli. +func archUpdateCastagnoli(crc uint32, p []byte) uint32 { + if !hasVX { + panic("not available") + } + // Use vectorized function if data length is above threshold. + if len(p) >= vxMinLen { + aligned := len(p) & ^vxAlignMask + crc = vectorizedCastagnoli(crc, p[:aligned]) + p = p[aligned:] + } + if len(p) == 0 { + return crc + } + return slicingUpdate(crc, archCastagnoliTable8, p) +} + +func archAvailableIEEE() bool { + return hasVX +} + +var archIeeeTable8 *slicing8Table + +func archInitIEEE() { + if !hasVX { + panic("not available") + } + // We still use slicing-by-8 for small buffers. + archIeeeTable8 = slicingMakeTable(IEEE) +} + +// archUpdateIEEE calculates the checksum of p using vectorizedIEEE. +func archUpdateIEEE(crc uint32, p []byte) uint32 { + if !hasVX { + panic("not available") + } + // Use vectorized function if data length is above threshold. + if len(p) >= vxMinLen { + aligned := len(p) & ^vxAlignMask + crc = vectorizedIEEE(crc, p[:aligned]) + p = p[aligned:] + } + if len(p) == 0 { + return crc + } + return slicingUpdate(crc, archIeeeTable8, p) +} diff --git a/vendor/github.com/klauspost/crc32/crc32_s390x.s b/vendor/github.com/klauspost/crc32/crc32_s390x.s new file mode 100644 index 00000000000..e980ca29d6d --- /dev/null +++ b/vendor/github.com/klauspost/crc32/crc32_s390x.s @@ -0,0 +1,249 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build s390x + +#include "textflag.h" + +// Vector register range containing CRC-32 constants + +#define CONST_PERM_LE2BE V9 +#define CONST_R2R1 V10 +#define CONST_R4R3 V11 +#define CONST_R5 V12 +#define CONST_RU_POLY V13 +#define CONST_CRC_POLY V14 + +// The CRC-32 constant block contains reduction constants to fold and +// process particular chunks of the input data stream in parallel. +// +// Note that the constant definitions below are extended in order to compute +// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction. +// The rightmost doubleword can be 0 to prevent contribution to the result or +// can be multiplied by 1 to perform an XOR without the need for a separate +// VECTOR EXCLUSIVE OR instruction. +// +// The polynomials used are bit-reflected: +// +// IEEE: P'(x) = 0x0edb88320 +// Castagnoli: P'(x) = 0x082f63b78 + +// IEEE polynomial constants +DATA ·crcleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask +DATA ·crcleconskp+8(SB)/8, $0x0706050403020100 +DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596 // R2 +DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4 // R1 +DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e // R4 +DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0 // R3 +DATA ·crcleconskp+48(SB)/8, $0x0000000000000000 +DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124 // R5 +DATA ·crcleconskp+64(SB)/8, $0x0000000000000000 +DATA ·crcleconskp+72(SB)/8, $0x00000001F7011641 // u' +DATA ·crcleconskp+80(SB)/8, $0x0000000000000000 +DATA ·crcleconskp+88(SB)/8, $0x00000001DB710641 // P'(x) << 1 + +GLOBL ·crcleconskp(SB), RODATA, $144 + +// Castagonli Polynomial constants +DATA ·crccleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask +DATA ·crccleconskp+8(SB)/8, $0x0706050403020100 +DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8 // R2 +DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02 // R1 +DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6 // R4 +DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe // R3 +DATA ·crccleconskp+48(SB)/8, $0x0000000000000000 +DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8 // R5 +DATA ·crccleconskp+64(SB)/8, $0x0000000000000000 +DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1 // u' +DATA ·crccleconskp+80(SB)/8, $0x0000000000000000 +DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1 + +GLOBL ·crccleconskp(SB), RODATA, $144 + +// func hasVectorFacility() bool +TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1 + MOVD $x-24(SP), R1 + XC $24, 0(R1), 0(R1) // clear the storage + MOVD $2, R0 // R0 is the number of double words stored -1 + WORD $0xB2B01000 // STFLE 0(R1) + XOR R0, R0 // reset the value of R0 + MOVBZ z-8(SP), R1 + AND $0x40, R1 + BEQ novector + +vectorinstalled: + // check if the vector instruction has been enabled + VLEIB $0, $0xF, V16 + VLGVB $0, V16, R1 + CMPBNE R1, $0xF, novector + MOVB $1, ret+0(FP) // have vx + RET + +novector: + MOVB $0, ret+0(FP) // no vx + RET + +// The CRC-32 function(s) use these calling conventions: +// +// Parameters: +// +// R2: Initial CRC value, typically ~0; and final CRC (return) value. +// R3: Input buffer pointer, performance might be improved if the +// buffer is on a doubleword boundary. +// R4: Length of the buffer, must be 64 bytes or greater. +// +// Register usage: +// +// R5: CRC-32 constant pool base pointer. +// V0: Initial CRC value and intermediate constants and results. +// V1..V4: Data for CRC computation. +// V5..V8: Next data chunks that are fetched from the input buffer. +// +// V9..V14: CRC-32 constants. + +// func vectorizedIEEE(crc uint32, p []byte) uint32 +TEXT ·vectorizedIEEE(SB), NOSPLIT, $0 + MOVWZ crc+0(FP), R2 // R2 stores the CRC value + MOVD p+8(FP), R3 // data pointer + MOVD p_len+16(FP), R4 // len(p) + + MOVD $·crcleconskp(SB), R5 + BR vectorizedBody<>(SB) + +// func vectorizedCastagnoli(crc uint32, p []byte) uint32 +TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0 + MOVWZ crc+0(FP), R2 // R2 stores the CRC value + MOVD p+8(FP), R3 // data pointer + MOVD p_len+16(FP), R4 // len(p) + + // R5: crc-32 constant pool base pointer, constant is used to reduce crc + MOVD $·crccleconskp(SB), R5 + BR vectorizedBody<>(SB) + +TEXT vectorizedBody<>(SB), NOSPLIT, $0 + XOR $0xffffffff, R2 // NOTW R2 + VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY + + // Load the initial CRC value into the rightmost word of V0 + VZERO V0 + VLVGF $3, R2, V0 + + // Crash if the input size is less than 64-bytes. + CMP R4, $64 + BLT crash + + // Load a 64-byte data chunk and XOR with CRC + VLM 0(R3), V1, V4 // 64-bytes into V1..V4 + + // Reflect the data if the CRC operation is in the bit-reflected domain + VPERM V1, V1, CONST_PERM_LE2BE, V1 + VPERM V2, V2, CONST_PERM_LE2BE, V2 + VPERM V3, V3, CONST_PERM_LE2BE, V3 + VPERM V4, V4, CONST_PERM_LE2BE, V4 + + VX V0, V1, V1 // V1 ^= CRC + ADD $64, R3 // BUF = BUF + 64 + ADD $(-64), R4 + + // Check remaining buffer size and jump to proper folding method + CMP R4, $64 + BLT less_than_64bytes + +fold_64bytes_loop: + // Load the next 64-byte data chunk into V5 to V8 + VLM 0(R3), V5, V8 + VPERM V5, V5, CONST_PERM_LE2BE, V5 + VPERM V6, V6, CONST_PERM_LE2BE, V6 + VPERM V7, V7, CONST_PERM_LE2BE, V7 + VPERM V8, V8, CONST_PERM_LE2BE, V8 + + // Perform a GF(2) multiplication of the doublewords in V1 with + // the reduction constants in V0. The intermediate result is + // then folded (accumulated) with the next data chunk in V5 and + // stored in V1. Repeat this step for the register contents + // in V2, V3, and V4 respectively. + + VGFMAG CONST_R2R1, V1, V5, V1 + VGFMAG CONST_R2R1, V2, V6, V2 + VGFMAG CONST_R2R1, V3, V7, V3 + VGFMAG CONST_R2R1, V4, V8, V4 + + // Adjust buffer pointer and length for next loop + ADD $64, R3 // BUF = BUF + 64 + ADD $(-64), R4 // LEN = LEN - 64 + + CMP R4, $64 + BGE fold_64bytes_loop + +less_than_64bytes: + // Fold V1 to V4 into a single 128-bit value in V1 + VGFMAG CONST_R4R3, V1, V2, V1 + VGFMAG CONST_R4R3, V1, V3, V1 + VGFMAG CONST_R4R3, V1, V4, V1 + + // Check whether to continue with 64-bit folding + CMP R4, $16 + BLT final_fold + +fold_16bytes_loop: + VL 0(R3), V2 // Load next data chunk + VPERM V2, V2, CONST_PERM_LE2BE, V2 + + VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk + + // Adjust buffer pointer and size for folding next data chunk + ADD $16, R3 + ADD $-16, R4 + + // Process remaining data chunks + CMP R4, $16 + BGE fold_16bytes_loop + +final_fold: + VLEIB $7, $0x40, V9 + VSRLB V9, CONST_R4R3, V0 + VLEIG $0, $1, V0 + + VGFMG V0, V1, V1 + + VLEIB $7, $0x20, V9 // Shift by words + VSRLB V9, V1, V2 // Store remaining bits in V2 + VUPLLF V1, V1 // Split rightmost doubleword + VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2 + + // The input values to the Barret reduction are the degree-63 polynomial + // in V1 (R(x)), degree-32 generator polynomial, and the reduction + // constant u. The Barret reduction result is the CRC value of R(x) mod + // P(x). + // + // The Barret reduction algorithm is defined as: + // + // 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u + // 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) + // 3. C(x) = R(x) XOR T2(x) mod x^32 + // + // Note: To compensate the division by x^32, use the vector unpack + // instruction to move the leftmost word into the leftmost doubleword + // of the vector register. The rightmost doubleword is multiplied + // with zero to not contribute to the intermedate results. + + // T1(x) = floor( R(x) / x^32 ) GF2MUL u + VUPLLF V1, V2 + VGFMG CONST_RU_POLY, V2, V2 + + // Compute the GF(2) product of the CRC polynomial in VO with T1(x) in + // V2 and XOR the intermediate result, T2(x), with the value in V1. + // The final result is in the rightmost word of V2. + + VUPLLF V2, V2 + VGFMAG CONST_CRC_POLY, V2, V1, V2 + +done: + VLGVF $2, V2, R2 + XOR $0xffffffff, R2 // NOTW R2 + MOVWZ R2, ret + 32(FP) + RET + +crash: + MOVD $0, (R0) // input size is less than 64-bytes diff --git a/vendor/vendor.json b/vendor/vendor.json index cb8d374e6a7..414e5c6b404 100644 --- a/vendor/vendor.json +++ b/vendor/vendor.json @@ -250,10 +250,10 @@ "revisionTime": "2014-10-17T13:07:13-07:00" }, { - "checksumSHA1": "hX+j017F4PEtIUGflAneDtLC6hs=", + "checksumSHA1": "BM6ZlNJmtKy3GBoWwg2X55gnZ4A=", "path": "github.com/klauspost/crc32", - "revision": "19b0b332c9e4516a6370a0456e6182c3b5036720", - "revisionTime": "2016-02-19T14:26:09Z" + "revision": "cb6bfca970f6908083f26f39a79009d608efd5cd", + "revisionTime": "2016-10-16T15:41:25Z" }, { "path": "github.com/kr/pretty",