From 43b9e89bdebe625745a13005cde04d2d0b242a83 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Fri, 3 Mar 2023 16:01:27 +0100 Subject: [PATCH 1/3] Add Intel LZ4s converter --- internal/lz4ref/block.go | 212 +++++++++++ s2/_generate/gen.go | 28 +- s2/encodeblock_amd64.go | 12 +- s2/encodeblock_amd64.s | 753 +++++++++++++++++++++++++++++++++++++++ s2/lz4sconvert.go | 467 ++++++++++++++++++++++++ s2/lz4sconvert_test.go | 102 ++++++ 6 files changed, 1567 insertions(+), 7 deletions(-) create mode 100644 s2/lz4sconvert.go create mode 100644 s2/lz4sconvert_test.go diff --git a/internal/lz4ref/block.go b/internal/lz4ref/block.go index cedcd98bc2..7ac94132a8 100644 --- a/internal/lz4ref/block.go +++ b/internal/lz4ref/block.go @@ -86,6 +86,13 @@ func CompressBlock(src, dst []byte) (int, error) { return n, err } +func CompressBlockLZ4s(src, dst []byte) (int, error) { + c := compressorPool.Get().(*Compressor) + n, err := c.CompressBlockLZ4s(src, dst) + compressorPool.Put(c) + return n, err +} + func (c *Compressor) CompressBlock(src, dst []byte) (int, error) { // Zero out reused table to avoid non-deterministic output (issue #65). c.reset() @@ -290,6 +297,211 @@ lastLiterals: return di, nil } +func (c *Compressor) CompressBlockLZ4s(src, dst []byte) (int, error) { + // Zero out reused table to avoid non-deterministic output (issue #65). + c.reset() + + const debug = false + const minMatch = 3 + + if debug { + fmt.Printf("lz4 block start: len(src): %d, len(dst):%d \n", len(src), len(dst)) + } + + // Return 0, nil only if the destination buffer size is < CompressBlockBound. + isNotCompressible := len(dst) < CompressBlockBound(len(src)) + + // adaptSkipLog sets how quickly the compressor begins skipping blocks when data is incompressible. + // This significantly speeds up incompressible data and usually has very small impact on compression. + // bytes to skip = 1 + (bytes since last match >> adaptSkipLog) + const adaptSkipLog = 7 + + // si: Current position of the search. + // anchor: Position of the current literals. + var si, di, anchor int + sn := len(src) - mfLimit + if sn <= 0 { + goto lastLiterals + } + + // Fast scan strategy: the hash table only stores the last five-byte sequences. + for si < sn { + // Hash the next five bytes (sequence)... + match := binary.LittleEndian.Uint64(src[si:]) + h := blockHash(match) + h2 := blockHash(match >> 8) + + // We check a match at s, s+1 and s+2 and pick the first one we get. + // Checking 3 only requires us to load the source one. + ref := c.get(h, si) + ref2 := c.get(h2, si+1) + c.put(h, si) + c.put(h2, si+1) + + offset := si - ref + + if offset <= 0 || offset >= winSize || uint32(match) != binary.LittleEndian.Uint32(src[ref:]) { + // No match. Start calculating another hash. + // The processor can usually do this out-of-order. + h = blockHash(match >> 16) + ref3 := c.get(h, si+2) + + // Check the second match at si+1 + si += 1 + offset = si - ref2 + + if offset <= 0 || offset >= winSize || uint32(match>>8) != binary.LittleEndian.Uint32(src[ref2:]) { + // No match. Check the third match at si+2 + si += 1 + offset = si - ref3 + c.put(h, si) + + if offset <= 0 || offset >= winSize || uint32(match>>16) != binary.LittleEndian.Uint32(src[ref3:]) { + // Skip one extra byte (at si+3) before we check 3 matches again. + si += 2 + (si-anchor)>>adaptSkipLog + continue + } + } + } + + // Match found. + lLen := si - anchor // Literal length. + // We already matched 4 bytes. + mLen := 4 + + // Extend backwards if we can, reducing literals. + tOff := si - offset - 1 + for lLen > 0 && tOff >= 0 && src[si-1] == src[tOff] { + si-- + tOff-- + lLen-- + mLen++ + } + + // Add the match length, so we continue search at the end. + // Use mLen to store the offset base. + si, mLen = si+mLen, si+minMatch + + // Find the longest match by looking by batches of 8 bytes. + for si+8 <= sn { + x := binary.LittleEndian.Uint64(src[si:]) ^ binary.LittleEndian.Uint64(src[si-offset:]) + if x == 0 { + si += 8 + } else { + // Stop is first non-zero byte. + si += bits.TrailingZeros64(x) >> 3 + break + } + } + + mLen = si - mLen + if di >= len(dst) { + return 0, ErrInvalidSourceShortBuffer + } + if mLen < 0xF { + dst[di] = byte(mLen) + } else { + dst[di] = 0xF + } + + // Encode literals length. + if debug { + fmt.Printf("emit %d literals\n", lLen) + } + if lLen < 0xF { + dst[di] |= byte(lLen << 4) + } else { + dst[di] |= 0xF0 + di++ + l := lLen - 0xF + for ; l >= 0xFF && di < len(dst); l -= 0xFF { + dst[di] = 0xFF + di++ + } + if di >= len(dst) { + return 0, ErrInvalidSourceShortBuffer + } + dst[di] = byte(l) + } + di++ + + // Literals. + if di+lLen > len(dst) { + return 0, ErrInvalidSourceShortBuffer + } + copy(dst[di:di+lLen], src[anchor:anchor+lLen]) + di += lLen + 2 + anchor = si + + // Encode offset. + if debug { + fmt.Printf("emit copy, length: %d, offset: %d\n", mLen+minMatch, offset) + } + if di > len(dst) { + return 0, ErrInvalidSourceShortBuffer + } + dst[di-2], dst[di-1] = byte(offset), byte(offset>>8) + + // Encode match length part 2. + if mLen >= 0xF { + for mLen -= 0xF; mLen >= 0xFF && di < len(dst); mLen -= 0xFF { + dst[di] = 0xFF + di++ + } + if di >= len(dst) { + return 0, ErrInvalidSourceShortBuffer + } + dst[di] = byte(mLen) + di++ + } + // Check if we can load next values. + if si >= sn { + break + } + // Hash match end-2 + h = blockHash(binary.LittleEndian.Uint64(src[si-2:])) + c.put(h, si-2) + } + +lastLiterals: + if isNotCompressible && anchor == 0 { + // Incompressible. + return 0, nil + } + + // Last literals. + if di >= len(dst) { + return 0, ErrInvalidSourceShortBuffer + } + lLen := len(src) - anchor + if lLen < 0xF { + dst[di] = byte(lLen << 4) + } else { + dst[di] = 0xF0 + di++ + for lLen -= 0xF; lLen >= 0xFF && di < len(dst); lLen -= 0xFF { + dst[di] = 0xFF + di++ + } + if di >= len(dst) { + return 0, ErrInvalidSourceShortBuffer + } + dst[di] = byte(lLen) + } + di++ + + // Write the last literals. + if isNotCompressible && di >= anchor { + // Incompressible. + return 0, nil + } + if di+len(src)-anchor > len(dst) { + return 0, ErrInvalidSourceShortBuffer + } + di += copy(dst[di:di+len(src)-anchor], src[anchor:]) + return di, nil +} + func UncompressBlock(dst, src []byte) (ret int) { // Restrict capacities so we don't read or write out of bounds. dst = dst[:len(dst):len(dst)] diff --git a/s2/_generate/gen.go b/s2/_generate/gen.go index c5be5443de..a81393303a 100644 --- a/s2/_generate/gen.go +++ b/s2/_generate/gen.go @@ -104,9 +104,11 @@ func main() { o.genEmitCopyNoRepeat() o.snappy = false o.genMatchLen() - o.cvtLZ4BlockAsm() + o.cvtLZ4BlockAsm(false) + o.cvtLZ4BlockAsm(true) o.snappy = true - o.cvtLZ4BlockAsm() + o.cvtLZ4BlockAsm(false) + o.cvtLZ4BlockAsm(true) Generate() } @@ -2862,15 +2864,22 @@ func (o options) matchLenAlt(name string, a, b, len reg.GPVirtual, end LabelRef) return matched } -func (o options) cvtLZ4BlockAsm() { +func (o options) cvtLZ4BlockAsm(lz4s bool) { snap := "Asm" name := "lz4_s2_" + srcAlgo := "LZ4" + dstAlgo := "S2" if o.snappy { snap = "SnappyAsm" name = "lz4_snappy_" + dstAlgo = "Snappy" } - TEXT("cvtLZ4Block"+snap, NOSPLIT, "func(dst, src []byte) (uncompressed int, dstUsed int)") - Doc("cvtLZ4Block converts an LZ4 block to S2", "") + if lz4s { + name = strings.ReplaceAll(name, "lz4", "lz4s") + srcAlgo = "LZ4s" + } + TEXT("cvt"+srcAlgo+"Block"+snap, NOSPLIT, "func(dst, src []byte) (uncompressed int, dstUsed int)") + Doc("cvt"+srcAlgo+"Block converts an "+srcAlgo+" block to "+dstAlgo, "") Pragma("noescape") o.outputMargin = 10 o.maxOffset = math.MaxUint16 @@ -2914,7 +2923,10 @@ func (o options) cvtLZ4BlockAsm() { JAE(LabelRef(name + "dstfull")) } - const lz4MinMatch = 4 + var lz4MinMatch = 4 + if lz4s { + lz4MinMatch = 3 + } Label(name + "loop") checkSrc(src) @@ -2971,6 +2983,10 @@ func (o options) cvtLZ4BlockAsm() { JMP(LabelRef(name + "corrupt")) Label(name + "match") + if lz4s { + CMPQ(ml, U8(lz4MinMatch)) + JEQ(LabelRef(name + "loop")) + } // if s >= len(src)-2 { end := GP64() LEAQ(Mem{Base: src, Disp: 2}, end) diff --git a/s2/encodeblock_amd64.go b/s2/encodeblock_amd64.go index 9f3dc8c29f..297e41501b 100644 --- a/s2/encodeblock_amd64.go +++ b/s2/encodeblock_amd64.go @@ -212,7 +212,17 @@ func matchLen(a []byte, b []byte) int //go:noescape func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) -// cvtLZ4Block converts an LZ4 block to S2 +// cvtLZ4sBlock converts an LZ4s block to S2 +// +//go:noescape +func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) + +// cvtLZ4Block converts an LZ4 block to Snappy // //go:noescape func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) + +// cvtLZ4sBlock converts an LZ4s block to Snappy +// +//go:noescape +func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index 19bd5237bc..12a4de3be5 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -19271,6 +19271,491 @@ lz4_s2_dstfull: MOVQ SI, uncompressed+48(FP) RET +// func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) +// Requires: SSE2 +TEXT ·cvtLZ4sBlockAsm(SB), NOSPLIT, $0-64 + XORQ SI, SI + MOVQ dst_base+0(FP), AX + MOVQ dst_len+8(FP), CX + MOVQ src_base+24(FP), DX + MOVQ src_len+32(FP), BX + LEAQ (DX)(BX*1), BX + LEAQ -10(AX)(CX*1), CX + XORQ DI, DI + +lz4s_s2_loop: + CMPQ DX, BX + JAE lz4s_s2_corrupt + CMPQ AX, CX + JAE lz4s_s2_dstfull + MOVBQZX (DX), R8 + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x04, R9 + ANDQ $0x0f, R10 + CMPQ R8, $0xf0 + JB lz4s_s2_ll_end + +lz4s_s2_ll_loop: + INCQ DX + CMPQ DX, BX + JAE lz4s_s2_corrupt + MOVBQZX (DX), R8 + ADDQ R8, R9 + CMPQ R8, $0xff + JEQ lz4s_s2_ll_loop + +lz4s_s2_ll_end: + LEAQ (DX)(R9*1), R8 + ADDQ $0x03, R10 + CMPQ R8, BX + JAE lz4s_s2_corrupt + INCQ DX + INCQ R8 + TESTQ R9, R9 + JZ lz4s_s2_lits_done + LEAQ (AX)(R9*1), R11 + CMPQ R11, CX + JAE lz4s_s2_dstfull + ADDQ R9, SI + LEAL -1(R9), R11 + CMPL R11, $0x3c + JLT one_byte_lz4s_s2 + CMPL R11, $0x00000100 + JLT two_bytes_lz4s_s2 + CMPL R11, $0x00010000 + JLT three_bytes_lz4s_s2 + CMPL R11, $0x01000000 + JLT four_bytes_lz4s_s2 + MOVB $0xfc, (AX) + MOVL R11, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_lz4s_s2 + +four_bytes_lz4s_s2: + MOVL R11, R12 + SHRL $0x10, R12 + MOVB $0xf8, (AX) + MOVW R11, 1(AX) + MOVB R12, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_lz4s_s2 + +three_bytes_lz4s_s2: + MOVB $0xf4, (AX) + MOVW R11, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_lz4s_s2 + +two_bytes_lz4s_s2: + MOVB $0xf0, (AX) + MOVB R11, 1(AX) + ADDQ $0x02, AX + CMPL R11, $0x40 + JL memmove_lz4s_s2 + JMP memmove_long_lz4s_s2 + +one_byte_lz4s_s2: + SHLB $0x02, R11 + MOVB R11, (AX) + ADDQ $0x01, AX + +memmove_lz4s_s2: + LEAQ (AX)(R9*1), R11 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_lz4s_s2_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_lz4s_s2_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_lz4s_s2_memmove_move_17through32 + JMP emit_lit_memmove_lz4s_s2_memmove_move_33through64 + +emit_lit_memmove_lz4s_s2_memmove_move_8: + MOVQ (DX), R12 + MOVQ R12, (AX) + JMP memmove_end_copy_lz4s_s2 + +emit_lit_memmove_lz4s_s2_memmove_move_8through16: + MOVQ (DX), R12 + MOVQ -8(DX)(R9*1), DX + MOVQ R12, (AX) + MOVQ DX, -8(AX)(R9*1) + JMP memmove_end_copy_lz4s_s2 + +emit_lit_memmove_lz4s_s2_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_lz4s_s2 + +emit_lit_memmove_lz4s_s2_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R9*1), X2 + MOVOU -16(DX)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_lz4s_s2: + MOVQ R11, AX + JMP lz4s_s2_lits_emit_done + +memmove_long_lz4s_s2: + LEAQ (AX)(R9*1), R11 + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R9*1), X2 + MOVOU -16(DX)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R12 + ANDL $0x0000001f, R12 + MOVQ $0x00000040, R14 + SUBQ R12, R14 + DECQ R13 + JA emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32 + LEAQ -32(DX)(R14*1), R12 + LEAQ -32(AX)(R14*1), R15 + +emit_lit_memmove_long_lz4s_s2large_big_loop_back: + MOVOU (R12), X4 + MOVOU 16(R12), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R12 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_lz4s_s2large_big_loop_back + +emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32: + MOVOU -32(DX)(R14*1), X4 + MOVOU -16(DX)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R11, AX + +lz4s_s2_lits_emit_done: + MOVQ R8, DX + +lz4s_s2_lits_done: + CMPQ DX, BX + JNE lz4s_s2_match + CMPQ R10, $0x03 + JEQ lz4s_s2_done + JMP lz4s_s2_corrupt + +lz4s_s2_match: + CMPQ R10, $0x03 + JEQ lz4s_s2_loop + LEAQ 2(DX), R8 + CMPQ R8, BX + JAE lz4s_s2_corrupt + MOVWQZX (DX), R9 + MOVQ R8, DX + TESTQ R9, R9 + JZ lz4s_s2_corrupt + CMPQ R9, SI + JA lz4s_s2_corrupt + CMPQ R10, $0x12 + JNE lz4s_s2_ml_done + +lz4s_s2_ml_loop: + MOVBQZX (DX), R8 + INCQ DX + ADDQ R8, R10 + CMPQ DX, BX + JAE lz4s_s2_corrupt + CMPQ R8, $0xff + JEQ lz4s_s2_ml_loop + +lz4s_s2_ml_done: + ADDQ R10, SI + CMPQ R9, DI + JNE lz4s_s2_docopy + + // emitRepeat +emit_repeat_again_lz4_s2: + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JLE repeat_two_lz4_s2 + CMPL R8, $0x0c + JGE cant_repeat_two_offset_lz4_s2 + CMPL R9, $0x00000800 + JLT repeat_two_offset_lz4_s2 + +cant_repeat_two_offset_lz4_s2: + CMPL R10, $0x00000104 + JLT repeat_three_lz4_s2 + CMPL R10, $0x00010100 + JLT repeat_four_lz4_s2 + CMPL R10, $0x0100ffff + JLT repeat_five_lz4_s2 + LEAL -16842747(R10), R10 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_lz4_s2 + +repeat_five_lz4_s2: + LEAL -65536(R10), R10 + MOVL R10, R9 + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, R9 + MOVB R9, 4(AX) + ADDQ $0x05, AX + JMP lz4s_s2_loop + +repeat_four_lz4_s2: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4s_s2_loop + +repeat_three_lz4_s2: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4s_s2_loop + +repeat_two_lz4_s2: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +repeat_two_offset_lz4_s2: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +lz4s_s2_docopy: + MOVQ R9, DI + + // emitCopy + CMPL R10, $0x40 + JLE two_byte_offset_short_lz4_s2 + CMPL R9, $0x00000800 + JAE long_offset_short_lz4_s2 + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB R9, 1(AX) + MOVL R9, R11 + SHRL $0x08, R11 + SHLL $0x05, R11 + ORL R11, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b + +emit_repeat_again_lz4_s2_emit_copy_short_2b: + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JLE repeat_two_lz4_s2_emit_copy_short_2b + CMPL R8, $0x0c + JGE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b + CMPL R9, $0x00000800 + JLT repeat_two_offset_lz4_s2_emit_copy_short_2b + +cant_repeat_two_offset_lz4_s2_emit_copy_short_2b: + CMPL R10, $0x00000104 + JLT repeat_three_lz4_s2_emit_copy_short_2b + CMPL R10, $0x00010100 + JLT repeat_four_lz4_s2_emit_copy_short_2b + CMPL R10, $0x0100ffff + JLT repeat_five_lz4_s2_emit_copy_short_2b + LEAL -16842747(R10), R10 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_lz4_s2_emit_copy_short_2b + +repeat_five_lz4_s2_emit_copy_short_2b: + LEAL -65536(R10), R10 + MOVL R10, R9 + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, R9 + MOVB R9, 4(AX) + ADDQ $0x05, AX + JMP lz4s_s2_loop + +repeat_four_lz4_s2_emit_copy_short_2b: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4s_s2_loop + +repeat_three_lz4_s2_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4s_s2_loop + +repeat_two_lz4_s2_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +repeat_two_offset_lz4_s2_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +long_offset_short_lz4_s2: + MOVB $0xee, (AX) + MOVW R9, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + + // emitRepeat +emit_repeat_again_lz4_s2_emit_copy_short: + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JLE repeat_two_lz4_s2_emit_copy_short + CMPL R8, $0x0c + JGE cant_repeat_two_offset_lz4_s2_emit_copy_short + CMPL R9, $0x00000800 + JLT repeat_two_offset_lz4_s2_emit_copy_short + +cant_repeat_two_offset_lz4_s2_emit_copy_short: + CMPL R10, $0x00000104 + JLT repeat_three_lz4_s2_emit_copy_short + CMPL R10, $0x00010100 + JLT repeat_four_lz4_s2_emit_copy_short + CMPL R10, $0x0100ffff + JLT repeat_five_lz4_s2_emit_copy_short + LEAL -16842747(R10), R10 + MOVL $0xfffb001d, (AX) + MOVB $0xff, 4(AX) + ADDQ $0x05, AX + JMP emit_repeat_again_lz4_s2_emit_copy_short + +repeat_five_lz4_s2_emit_copy_short: + LEAL -65536(R10), R10 + MOVL R10, R9 + MOVW $0x001d, (AX) + MOVW R10, 2(AX) + SARL $0x10, R9 + MOVB R9, 4(AX) + ADDQ $0x05, AX + JMP lz4s_s2_loop + +repeat_four_lz4_s2_emit_copy_short: + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4s_s2_loop + +repeat_three_lz4_s2_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4s_s2_loop + +repeat_two_lz4_s2_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +repeat_two_offset_lz4_s2_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +two_byte_offset_short_lz4_s2: + MOVL R10, R8 + SHLL $0x02, R8 + CMPL R10, $0x0c + JGE emit_copy_three_lz4_s2 + CMPL R9, $0x00000800 + JGE emit_copy_three_lz4_s2 + LEAL -15(R8), R8 + MOVB R9, 1(AX) + SHRL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + JMP lz4s_s2_loop + +emit_copy_three_lz4_s2: + LEAL -2(R8), R8 + MOVB R8, (AX) + MOVW R9, 1(AX) + ADDQ $0x03, AX + JMP lz4s_s2_loop + +lz4s_s2_done: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ SI, uncompressed+48(FP) + MOVQ AX, dstUsed+56(FP) + RET + +lz4s_s2_corrupt: + XORQ AX, AX + LEAQ -1(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +lz4s_s2_dstfull: + XORQ AX, AX + LEAQ -2(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + // func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) // Requires: SSE2 TEXT ·cvtLZ4BlockSnappyAsm(SB), NOSPLIT, $0-64 @@ -19536,3 +20021,271 @@ lz4_snappy_dstfull: LEAQ -2(AX), SI MOVQ SI, uncompressed+48(FP) RET + +// func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) +// Requires: SSE2 +TEXT ·cvtLZ4sBlockSnappyAsm(SB), NOSPLIT, $0-64 + XORQ SI, SI + MOVQ dst_base+0(FP), AX + MOVQ dst_len+8(FP), CX + MOVQ src_base+24(FP), DX + MOVQ src_len+32(FP), BX + LEAQ (DX)(BX*1), BX + LEAQ -10(AX)(CX*1), CX + +lz4s_snappy_loop: + CMPQ DX, BX + JAE lz4s_snappy_corrupt + CMPQ AX, CX + JAE lz4s_snappy_dstfull + MOVBQZX (DX), DI + MOVQ DI, R8 + MOVQ DI, R9 + SHRQ $0x04, R8 + ANDQ $0x0f, R9 + CMPQ DI, $0xf0 + JB lz4s_snappy_ll_end + +lz4s_snappy_ll_loop: + INCQ DX + CMPQ DX, BX + JAE lz4s_snappy_corrupt + MOVBQZX (DX), DI + ADDQ DI, R8 + CMPQ DI, $0xff + JEQ lz4s_snappy_ll_loop + +lz4s_snappy_ll_end: + LEAQ (DX)(R8*1), DI + ADDQ $0x03, R9 + CMPQ DI, BX + JAE lz4s_snappy_corrupt + INCQ DX + INCQ DI + TESTQ R8, R8 + JZ lz4s_snappy_lits_done + LEAQ (AX)(R8*1), R10 + CMPQ R10, CX + JAE lz4s_snappy_dstfull + ADDQ R8, SI + LEAL -1(R8), R10 + CMPL R10, $0x3c + JLT one_byte_lz4s_snappy + CMPL R10, $0x00000100 + JLT two_bytes_lz4s_snappy + CMPL R10, $0x00010000 + JLT three_bytes_lz4s_snappy + CMPL R10, $0x01000000 + JLT four_bytes_lz4s_snappy + MOVB $0xfc, (AX) + MOVL R10, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_lz4s_snappy + +four_bytes_lz4s_snappy: + MOVL R10, R11 + SHRL $0x10, R11 + MOVB $0xf8, (AX) + MOVW R10, 1(AX) + MOVB R11, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_lz4s_snappy + +three_bytes_lz4s_snappy: + MOVB $0xf4, (AX) + MOVW R10, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_lz4s_snappy + +two_bytes_lz4s_snappy: + MOVB $0xf0, (AX) + MOVB R10, 1(AX) + ADDQ $0x02, AX + CMPL R10, $0x40 + JL memmove_lz4s_snappy + JMP memmove_long_lz4s_snappy + +one_byte_lz4s_snappy: + SHLB $0x02, R10 + MOVB R10, (AX) + ADDQ $0x01, AX + +memmove_lz4s_snappy: + LEAQ (AX)(R8*1), R10 + + // genMemMoveShort + CMPQ R8, $0x08 + JLE emit_lit_memmove_lz4s_snappy_memmove_move_8 + CMPQ R8, $0x10 + JBE emit_lit_memmove_lz4s_snappy_memmove_move_8through16 + CMPQ R8, $0x20 + JBE emit_lit_memmove_lz4s_snappy_memmove_move_17through32 + JMP emit_lit_memmove_lz4s_snappy_memmove_move_33through64 + +emit_lit_memmove_lz4s_snappy_memmove_move_8: + MOVQ (DX), R11 + MOVQ R11, (AX) + JMP memmove_end_copy_lz4s_snappy + +emit_lit_memmove_lz4s_snappy_memmove_move_8through16: + MOVQ (DX), R11 + MOVQ -8(DX)(R8*1), DX + MOVQ R11, (AX) + MOVQ DX, -8(AX)(R8*1) + JMP memmove_end_copy_lz4s_snappy + +emit_lit_memmove_lz4s_snappy_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(R8*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R8*1) + JMP memmove_end_copy_lz4s_snappy + +emit_lit_memmove_lz4s_snappy_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R8*1), X2 + MOVOU -16(DX)(R8*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + +memmove_end_copy_lz4s_snappy: + MOVQ R10, AX + JMP lz4s_snappy_lits_emit_done + +memmove_long_lz4s_snappy: + LEAQ (AX)(R8*1), R10 + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R8*1), X2 + MOVOU -16(DX)(R8*1), X3 + MOVQ R8, R12 + SHRQ $0x05, R12 + MOVQ AX, R11 + ANDL $0x0000001f, R11 + MOVQ $0x00000040, R13 + SUBQ R11, R13 + DECQ R12 + JA emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32 + LEAQ -32(DX)(R13*1), R11 + LEAQ -32(AX)(R13*1), R14 + +emit_lit_memmove_long_lz4s_snappylarge_big_loop_back: + MOVOU (R11), X4 + MOVOU 16(R11), X5 + MOVOA X4, (R14) + MOVOA X5, 16(R14) + ADDQ $0x20, R14 + ADDQ $0x20, R11 + ADDQ $0x20, R13 + DECQ R12 + JNA emit_lit_memmove_long_lz4s_snappylarge_big_loop_back + +emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32: + MOVOU -32(DX)(R13*1), X4 + MOVOU -16(DX)(R13*1), X5 + MOVOA X4, -32(AX)(R13*1) + MOVOA X5, -16(AX)(R13*1) + ADDQ $0x20, R13 + CMPQ R8, R13 + JAE emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) + MOVQ R10, AX + +lz4s_snappy_lits_emit_done: + MOVQ DI, DX + +lz4s_snappy_lits_done: + CMPQ DX, BX + JNE lz4s_snappy_match + CMPQ R9, $0x03 + JEQ lz4s_snappy_done + JMP lz4s_snappy_corrupt + +lz4s_snappy_match: + CMPQ R9, $0x03 + JEQ lz4s_snappy_loop + LEAQ 2(DX), DI + CMPQ DI, BX + JAE lz4s_snappy_corrupt + MOVWQZX (DX), R8 + MOVQ DI, DX + TESTQ R8, R8 + JZ lz4s_snappy_corrupt + CMPQ R8, SI + JA lz4s_snappy_corrupt + CMPQ R9, $0x12 + JNE lz4s_snappy_ml_done + +lz4s_snappy_ml_loop: + MOVBQZX (DX), DI + INCQ DX + ADDQ DI, R9 + CMPQ DX, BX + JAE lz4s_snappy_corrupt + CMPQ DI, $0xff + JEQ lz4s_snappy_ml_loop + +lz4s_snappy_ml_done: + ADDQ R9, SI + + // emitCopy +two_byte_offset_lz4_s2: + CMPL R9, $0x40 + JLE two_byte_offset_short_lz4_s2 + MOVB $0xee, (AX) + MOVW R8, 1(AX) + LEAL -60(R9), R9 + ADDQ $0x03, AX + CMPQ AX, CX + JAE lz4s_snappy_loop + JMP two_byte_offset_lz4_s2 + +two_byte_offset_short_lz4_s2: + MOVL R9, DI + SHLL $0x02, DI + CMPL R9, $0x0c + JGE emit_copy_three_lz4_s2 + CMPL R8, $0x00000800 + JGE emit_copy_three_lz4_s2 + LEAL -15(DI), DI + MOVB R8, 1(AX) + SHRL $0x08, R8 + SHLL $0x05, R8 + ORL R8, DI + MOVB DI, (AX) + ADDQ $0x02, AX + JMP lz4s_snappy_loop + +emit_copy_three_lz4_s2: + LEAL -2(DI), DI + MOVB DI, (AX) + MOVW R8, 1(AX) + ADDQ $0x03, AX + JMP lz4s_snappy_loop + +lz4s_snappy_done: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ SI, uncompressed+48(FP) + MOVQ AX, dstUsed+56(FP) + RET + +lz4s_snappy_corrupt: + XORQ AX, AX + LEAQ -1(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +lz4s_snappy_dstfull: + XORQ AX, AX + LEAQ -2(AX), SI + MOVQ SI, uncompressed+48(FP) + RET diff --git a/s2/lz4sconvert.go b/s2/lz4sconvert.go new file mode 100644 index 0000000000..000f39719c --- /dev/null +++ b/s2/lz4sconvert.go @@ -0,0 +1,467 @@ +// Copyright (c) 2022 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "encoding/binary" + "fmt" +) + +// LZ4sConverter provides conversion from LZ4s. +// (Intel modified LZ4 Blocks) +// https://cdrdv2-public.intel.com/743912/743912-qat-programmers-guide-v2.0.pdf +// LZ4s is a variant of LZ4 block format. LZ4s should be considered as an intermediate compressed block format. +// The LZ4s format is selected when the application sets the compType to CPA_DC_LZ4S in CpaDcSessionSetupData. +// The LZ4s block returned by the Intel® QAT hardware can be used by an external +// software post-processing to generate other compressed data formats. +// The following table lists the differences between LZ4 and LZ4s block format. LZ4s block format uses +// the same high-level formatting as LZ4 block format with the following encoding changes: +// For Min Match of 4 bytes, Copy length value 1-15 means length 4-18 with 18 bytes adding an extra byte. +// ONLY "Min match of 4 bytes" is supported. +type LZ4sConverter struct { +} + +// ConvertBlock will convert an LZ4s block and append it as an S2 +// block without block length to dst. +// The uncompressed size is returned as well. +// dst must have capacity to contain the entire compressed block. +func (l *LZ4sConverter) ConvertBlock(dst, src []byte) ([]byte, int, error) { + if len(src) == 0 { + return dst, 0, nil + } + const debug = false + const inline = true + const lz4MinMatch = 3 + + s, d := 0, len(dst) + dst = dst[:cap(dst)] + if !debug && hasAmd64Asm { + res, sz := cvtLZ4sBlockAsm(dst[d:], src) + if res < 0 { + const ( + errCorrupt = -1 + errDstTooSmall = -2 + ) + switch res { + case errCorrupt: + return nil, 0, ErrCorrupt + case errDstTooSmall: + return nil, 0, ErrDstTooSmall + default: + return nil, 0, fmt.Errorf("unexpected result: %d", res) + } + } + if d+sz > len(dst) { + return nil, 0, ErrDstTooSmall + } + return dst[:d+sz], res, nil + } + + dLimit := len(dst) - 10 + var lastOffset uint16 + var uncompressed int + if debug { + fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst)) + } + + for { + if s >= len(src) { + return dst[:d], 0, ErrCorrupt + } + // Read literal info + token := src[s] + ll := int(token >> 4) + ml := int(lz4MinMatch + (token & 0xf)) + + // If upper nibble is 15, literal length is extended + if token >= 0xf0 { + for { + s++ + if s >= len(src) { + if debug { + fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return dst[:d], 0, ErrCorrupt + } + val := src[s] + ll += int(val) + if val != 255 { + break + } + } + } + // Skip past token + if s+ll >= len(src) { + if debug { + fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src)) + } + return nil, 0, ErrCorrupt + } + s++ + if ll > 0 { + if d+ll > dLimit { + return nil, 0, ErrDstTooSmall + } + if debug { + fmt.Printf("emit %d literals\n", ll) + } + d += emitLiteralGo(dst[d:], src[s:s+ll]) + s += ll + uncompressed += ll + } + + // Check if we are done... + if ml == lz4MinMatch { + if s == len(src) { + break + } + // 0 bytes. + continue + } + // 2 byte offset + if s >= len(src)-2 { + if debug { + fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2) + } + return nil, 0, ErrCorrupt + } + offset := binary.LittleEndian.Uint16(src[s:]) + s += 2 + if offset == 0 { + if debug { + fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s) + } + return nil, 0, ErrCorrupt + } + if int(offset) > uncompressed { + if debug { + fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed) + } + return nil, 0, ErrCorrupt + } + + if ml == lz4MinMatch+15 { + for { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + val := src[s] + s++ + ml += int(val) + if val != 255 { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + break + } + } + } + if offset == lastOffset { + if debug { + fmt.Printf("emit repeat, length: %d, offset: %d\n", ml, offset) + } + if !inline { + d += emitRepeat16(dst[d:], offset, ml) + } else { + length := ml + dst := dst[d:] + for len(dst) > 5 { + // Repeat offset, make length cheaper + length -= 4 + if length <= 4 { + dst[0] = uint8(length)<<2 | tagCopy1 + dst[1] = 0 + d += 2 + break + } + if length < 8 && offset < 2048 { + // Encode WITH offset + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1 + d += 2 + break + } + if length < (1<<8)+4 { + length -= 4 + dst[2] = uint8(length) + dst[1] = 0 + dst[0] = 5<<2 | tagCopy1 + d += 3 + break + } + if length < (1<<16)+(1<<8) { + length -= 1 << 8 + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 6<<2 | tagCopy1 + d += 4 + break + } + const maxRepeat = (1 << 24) - 1 + length -= 1 << 16 + left := 0 + if length > maxRepeat { + left = length - maxRepeat + 4 + length = maxRepeat - 4 + } + dst[4] = uint8(length >> 16) + dst[3] = uint8(length >> 8) + dst[2] = uint8(length >> 0) + dst[1] = 0 + dst[0] = 7<<2 | tagCopy1 + if left > 0 { + d += 5 + emitRepeat16(dst[5:], offset, left) + break + } + d += 5 + break + } + } + } else { + if debug { + fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset) + } + if !inline { + d += emitCopy16(dst[d:], offset, ml) + } else { + length := ml + dst := dst[d:] + for len(dst) > 5 { + // Offset no more than 2 bytes. + if length > 64 { + off := 3 + if offset < 2048 { + // emit 8 bytes as tagCopy1, rest as repeats. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1 + length -= 8 + off = 2 + } else { + // Emit a length 60 copy, encoded as 3 bytes. + // Emit remaining as repeat value (minimum 4 bytes). + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 59<<2 | tagCopy2 + length -= 60 + } + // Emit remaining as repeats, at least 4 bytes remain. + d += off + emitRepeat16(dst[off:], offset, length) + break + } + if length >= 12 || offset >= 2048 { + // Emit the remaining copy, encoded as 3 bytes. + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = uint8(length-1)<<2 | tagCopy2 + d += 3 + break + } + // Emit the remaining copy, encoded as 2 bytes. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + d += 2 + break + } + } + lastOffset = offset + } + uncompressed += ml + if d > dLimit { + return nil, 0, ErrDstTooSmall + } + } + + return dst[:d], uncompressed, nil +} + +// ConvertBlockSnappy will convert an LZ4s block and append it +// as a Snappy block without block length to dst. +// The uncompressed size is returned as well. +// dst must have capacity to contain the entire compressed block. +func (l *LZ4sConverter) ConvertBlockSnappy(dst, src []byte) ([]byte, int, error) { + if len(src) == 0 { + return dst, 0, nil + } + const debug = false + const lz4MinMatch = 3 + + s, d := 0, len(dst) + dst = dst[:cap(dst)] + // Use assembly when possible + if !debug && hasAmd64Asm { + res, sz := cvtLZ4sBlockSnappyAsm(dst[d:], src) + if res < 0 { + const ( + errCorrupt = -1 + errDstTooSmall = -2 + ) + switch res { + case errCorrupt: + return nil, 0, ErrCorrupt + case errDstTooSmall: + return nil, 0, ErrDstTooSmall + default: + return nil, 0, fmt.Errorf("unexpected result: %d", res) + } + } + if d+sz > len(dst) { + return nil, 0, ErrDstTooSmall + } + return dst[:d+sz], res, nil + } + + dLimit := len(dst) - 10 + var uncompressed int + if debug { + fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst)) + } + + for { + if s >= len(src) { + return nil, 0, ErrCorrupt + } + // Read literal info + token := src[s] + ll := int(token >> 4) + ml := int(lz4MinMatch + (token & 0xf)) + + // If upper nibble is 15, literal length is extended + if token >= 0xf0 { + for { + s++ + if s >= len(src) { + if debug { + fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + val := src[s] + ll += int(val) + if val != 255 { + break + } + } + } + // Skip past token + if s+ll >= len(src) { + if debug { + fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src)) + } + return nil, 0, ErrCorrupt + } + s++ + if ll > 0 { + if d+ll > dLimit { + return nil, 0, ErrDstTooSmall + } + if debug { + fmt.Printf("emit %d literals\n", ll) + } + d += emitLiteralGo(dst[d:], src[s:s+ll]) + s += ll + uncompressed += ll + } + + // Check if we are done... + if ml == lz4MinMatch { + if s == len(src) { + break + } + // 0 bytes. + continue + } + // 2 byte offset + if s >= len(src)-2 { + if debug { + fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2) + } + return nil, 0, ErrCorrupt + } + offset := binary.LittleEndian.Uint16(src[s:]) + s += 2 + if offset == 0 { + if debug { + fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s) + } + return nil, 0, ErrCorrupt + } + if int(offset) > uncompressed { + if debug { + fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed) + } + return nil, 0, ErrCorrupt + } + + if ml == lz4MinMatch+15 { + for { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + val := src[s] + s++ + ml += int(val) + if val != 255 { + if s >= len(src) { + if debug { + fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src)) + } + return nil, 0, ErrCorrupt + } + break + } + } + } + if debug { + fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset) + } + length := ml + // d += emitCopyNoRepeat(dst[d:], int(offset), ml) + for length > 0 { + if d >= dLimit { + return nil, 0, ErrDstTooSmall + } + + // Offset no more than 2 bytes. + if length > 64 { + // Emit a length 64 copy, encoded as 3 bytes. + dst[d+2] = uint8(offset >> 8) + dst[d+1] = uint8(offset) + dst[d+0] = 63<<2 | tagCopy2 + length -= 64 + d += 3 + continue + } + if length >= 12 || offset >= 2048 || length < 4 { + // Emit the remaining copy, encoded as 3 bytes. + dst[d+2] = uint8(offset >> 8) + dst[d+1] = uint8(offset) + dst[d+0] = uint8(length-1)<<2 | tagCopy2 + d += 3 + break + } + // Emit the remaining copy, encoded as 2 bytes. + dst[d+1] = uint8(offset) + dst[d+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + d += 2 + break + } + uncompressed += ml + if d > dLimit { + return nil, 0, ErrDstTooSmall + } + } + + return dst[:d], uncompressed, nil +} diff --git a/s2/lz4sconvert_test.go b/s2/lz4sconvert_test.go new file mode 100644 index 0000000000..f34ec26bc0 --- /dev/null +++ b/s2/lz4sconvert_test.go @@ -0,0 +1,102 @@ +// Copyright (c) 2023 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package s2 + +import ( + "bytes" + "encoding/binary" + "path/filepath" + "testing" + + "github.com/klauspost/compress/internal/lz4ref" + "github.com/klauspost/compress/internal/snapref" +) + +func TestLZ4sConverter_ConvertBlock(t *testing.T) { + for _, tf := range testFiles { + t.Run(tf.label, func(t *testing.T) { + if err := downloadBenchmarkFiles(t, tf.filename); err != nil { + t.Fatalf("failed to download testdata: %s", err) + } + + bDir := filepath.FromSlash(*benchdataDir) + data := readFile(t, filepath.Join(bDir, tf.filename)) + if n := tf.sizeLimit; 0 < n && n < len(data) { + data = data[:n] + } + + lz4Data := make([]byte, lz4ref.CompressBlockBound(len(data))) + n, err := lz4ref.CompressBlockLZ4s(data, lz4Data) + if err != nil { + t.Fatal(err) + } + if n == 0 { + t.Skip("incompressible") + return + } + t.Log("input size:", len(data)) + t.Log("lz4 size:", n) + lz4Data = lz4Data[:n] + s2Dst := make([]byte, binary.MaxVarintLen32, MaxEncodedLen(len(data))) + s2Dst = s2Dst[:binary.PutUvarint(s2Dst, uint64(len(data)))] + hdr := len(s2Dst) + + conv := LZ4sConverter{} + + szS := 0 + out, n, err := conv.ConvertBlockSnappy(s2Dst, lz4Data) + if err != nil { + t.Fatal(err) + } + if n != len(data) { + t.Fatalf("length mismatch: want %d, got %d", len(data), n) + } + szS = len(out) - hdr + t.Log("lz4->snappy size:", szS) + + decom, err := snapref.Decode(nil, out) + if err != nil { + t.Fatal(err) + } + if !bytes.Equal(decom, data) { + t.Errorf("output mismatch") + } + + sz := 0 + out, n, err = conv.ConvertBlock(s2Dst, lz4Data) + if err != nil { + t.Fatal(err) + } + if n != len(data) { + t.Fatalf("length mismatch: want %d, got %d", len(data), n) + } + sz = len(out) - hdr + t.Log("lz4->s2 size:", sz) + + decom, err = Decode(nil, out) + if err != nil { + t.Fatal(err) + } + if !bytes.Equal(decom, data) { + t.Errorf("output mismatch") + } + + out2 := Encode(s2Dst[:0], data) + sz2 := len(out2) - hdr + t.Log("s2 (default) size:", sz2) + + out2 = EncodeBetter(s2Dst[:0], data) + sz3 := len(out2) - hdr + t.Log("s2 (better) size:", sz3) + + t.Log("lz4 -> s2 bytes saved:", len(lz4Data)-sz) + t.Log("lz4 -> snappy bytes saved:", len(lz4Data)-szS) + t.Log("data -> s2 (default) bytes saved:", len(lz4Data)-sz2) + t.Log("data -> s2 (better) bytes saved:", len(lz4Data)-sz3) + t.Log("direct data -> s2 (default) compared to converted from lz4:", sz-sz2) + t.Log("direct data -> s2 (better) compared to converted from lz4:", sz-sz3) + }) + } +} From 0bcd5876220c0ee727924d802e11d7b2b20e892f Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Fri, 3 Mar 2023 16:07:36 +0100 Subject: [PATCH 2/3] Add noasm stubs. --- s2/encode_go.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/s2/encode_go.go b/s2/encode_go.go index d7749d75c2..0d39c7b0e0 100644 --- a/s2/encode_go.go +++ b/s2/encode_go.go @@ -717,3 +717,11 @@ func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) { func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) { panic("cvtLZ4BlockSnappyAsm should be unreachable") } + +func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) { + panic("cvtLZ4sBlockAsm should be unreachable") +} + +func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) { + panic("cvtLZ4sBlockSnappyAsm should be unreachable") +} From bdcc09bb154e95c3358077ba533b5c6f6006040c Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Fri, 3 Mar 2023 16:50:28 +0100 Subject: [PATCH 3/3] Add literal blocks without matches output. --- internal/lz4ref/block.go | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/internal/lz4ref/block.go b/internal/lz4ref/block.go index 7ac94132a8..c854b86c25 100644 --- a/internal/lz4ref/block.go +++ b/internal/lz4ref/block.go @@ -303,6 +303,7 @@ func (c *Compressor) CompressBlockLZ4s(src, dst []byte) (int, error) { const debug = false const minMatch = 3 + const addExtraLits = 32 // Suboptimal, but test emitting literals without matches. Set to 0 to disable. if debug { fmt.Printf("lz4 block start: len(src): %d, len(dst):%d \n", len(src), len(dst)) @@ -393,7 +394,18 @@ func (c *Compressor) CompressBlockLZ4s(src, dst []byte) (int, error) { break } } - + if addExtraLits > 15 { + // Add X lits. + if lLen > addExtraLits { + dst[di] = 0xf0 + dst[di+1] = byte(int(addExtraLits-15) & 0xff) // hack to compile + di += 2 + copy(dst[di:di+addExtraLits], src[anchor:anchor+lLen]) + di += addExtraLits + lLen -= addExtraLits + anchor += addExtraLits + } + } mLen = si - mLen if di >= len(dst) { return 0, ErrInvalidSourceShortBuffer