From 20effe55bb0167681337d5cb9b7954443f493d5a Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Mon, 4 Apr 2022 11:00:38 -0700 Subject: [PATCH] zstd: Store previous offsets in registers (#548) Not super visible in microbenches (presumably because of cache), but very visible in streaming decodes. Before/after: ``` BenchmarkDecoderSilesia-32 4 281168525 ns/op 753.81 MB/s 50218 B/op 44 allocs/op BenchmarkDecoderSilesia-32 6 190584583 ns/op 1112.09 MB/s 49446 B/op 45 allocs/op BenchmarkDecoderEnwik9-32 1 1439964200 ns/op 694.46 MB/s 71952 B/op 51 allocs/op BenchmarkDecoderEnwik9-32 1 1184307200 ns/op 844.38 MB/s 72144 B/op 53 allocs/op ``` --- zstd/_generate/gen.go | 103 ++++++----- zstd/seqdec_amd64.s | 410 +++++++++++++++++++++++------------------- 2 files changed, 281 insertions(+), 232 deletions(-) diff --git a/zstd/_generate/gen.go b/zstd/_generate/gen.go index f866ef0696..3267612c7f 100644 --- a/zstd/_generate/gen.go +++ b/zstd/_generate/gen.go @@ -146,6 +146,16 @@ func (o options) genDecodeSeqAsm(name string) { mlP := Mem{Base: seqBase, Disp: 1 * 8} // Pointer to current ml llP := Mem{Base: seqBase, Disp: 0 * 8} // Pointer to current ll + // Store previous offsets in registers. + var offsets [3]reg.GPVirtual + s := Dereference(Param("s")) + for i := range offsets { + offsets[i] = GP64() + po, _ := s.Field("prevOffset").Index(i).Resolve() + + MOVQ(po.Addr, offsets[i]) + } + // MAIN LOOP: Label(name + "_main_loop") @@ -209,7 +219,7 @@ func (o options) genDecodeSeqAsm(name string) { Comment("Adjust offset") - offset := o.adjustOffset(name+"_adjust", moP, llP, R14) + offset := o.adjustOffset(name+"_adjust", moP, llP, R14, &offsets) MOVQ(offset, moP) // Store offset Comment("Check values") @@ -265,6 +275,13 @@ func (o options) genDecodeSeqAsm(name string) { DECQ(iterationP.Addr) JNS(LabelRef(name + "_main_loop")) + // Store offsets + s = Dereference(Param("s")) + for i := range offsets { + po, _ := s.Field("prevOffset").Index(i).Resolve() + MOVQ(offsets[i], po.Addr) + } + // update bitreader state before returning br := Dereference(Param("br")) Store(brValue, br.Field("value")) @@ -454,12 +471,7 @@ func (o options) getBits(name string, nBits, brValue, brBitsRead reg.GPVirtual, return BX } -func (o options) adjustOffset(name string, moP, llP Mem, offsetB reg.GPVirtual) (offset reg.GPVirtual) { - s := Dereference(Param("s")) - - po0, _ := s.Field("prevOffset").Index(0).Resolve() - po1, _ := s.Field("prevOffset").Index(1).Resolve() - po2, _ := s.Field("prevOffset").Index(2).Resolve() +func (o options) adjustOffset(name string, moP, llP Mem, offsetB reg.GPVirtual, offsets *[3]reg.GPVirtual) (offset reg.GPVirtual) { offset = GP64() MOVQ(moP, offset) { @@ -472,10 +484,9 @@ func (o options) adjustOffset(name string, moP, llP Mem, offsetB reg.GPVirtual) CMPQ(offsetB, U8(1)) JBE(LabelRef(name + "_offsetB_1_or_0")) - tmp := XMM() - MOVUPS(po0.Addr, tmp) // tmp = (s.prevOffset[0], s.prevOffset[1]) - MOVQ(offset, po0.Addr) // s.prevOffset[0] = offset - MOVUPS(tmp, po1.Addr) // s.prevOffset[1], s.prevOffset[2] = s.prevOffset[0], s.prevOffset[1] + MOVQ(offsets[1], offsets[2]) // s.prevOffset[2] = s.prevOffset[1] + MOVQ(offsets[0], offsets[1]) // s.prevOffset[1] = s.prevOffset[0] + MOVQ(offset, offsets[0]) // s.prevOffset[0] = offset JMP(LabelRef(name + "_end")) } @@ -504,7 +515,7 @@ func (o options) adjustOffset(name string, moP, llP Mem, offsetB reg.GPVirtual) Label(name + "_offset_maybezero") TESTQ(offset, offset) JNZ(LabelRef(name + "_offset_nonzero")) - MOVQ(po0.Addr, offset) + MOVQ(offsets[0], offset) JMP(LabelRef(name + "_end")) } } @@ -515,31 +526,34 @@ func (o options) adjustOffset(name string, moP, llP Mem, offsetB reg.GPVirtual) // } else { // temp = s.prevOffset[offset] // } - // - // this if got transformed into: - // - // ofs := offset - // shift := 0 - // if offset == 3 { - // ofs = 0 - // shift = -1 - // } - // temp := s.prevOffset[ofs] + shift - // TODO: This should be easier... - CX, DX, R15 := GP64(), GP64(), GP64() - MOVQ(offset, CX) - XORQ(DX, DX) - MOVQ(I32(-1), R15) - CMPQ(offset, U8(3)) - CMOVQEQ(DX, CX) - CMOVQEQ(R15, DX) - prevOffset := GP64() - LEAQ(po0.Addr, prevOffset) // &prevOffset[0] - ADDQ(Mem{Base: prevOffset, Index: CX, Scale: 8}, DX) - temp := DX + temp := GP64() + CMPQ(offset, U8(1)) + JB(LabelRef(name + "_zero")) + JEQ(LabelRef(name + "_one")) + CMPQ(offset, U8(2)) + JA(LabelRef(name + "_three")) + JMP(LabelRef(name + "_two")) + + Label(name + "_zero") + MOVQ(offsets[0], temp) + JMP(LabelRef(name + "_test_temp_valid")) + + Label(name + "_one") + MOVQ(offsets[1], temp) + JMP(LabelRef(name + "_test_temp_valid")) + + Label(name + "_two") + MOVQ(offsets[2], temp) + JMP(LabelRef(name + "_test_temp_valid")) + + Label(name + "_three") + LEAQ(Mem{Base: offsets[0], Disp: -1}, temp) + + Label(name + "_test_temp_valid") // if temp == 0 { // temp = 1 // } + TESTQ(temp, temp) JNZ(LabelRef(name + "_temp_valid")) MOVQ(U32(1), temp) @@ -548,19 +562,18 @@ func (o options) adjustOffset(name string, moP, llP Mem, offsetB reg.GPVirtual) // s.prevOffset[2] = s.prevOffset[1] // } CMPQ(offset, U8(1)) - JZ(LabelRef(name + "_skip")) - tmp := GP64() - MOVQ(po1.Addr, tmp) - MOVQ(tmp, po2.Addr) // s.prevOffset[2] = s.prevOffset[1] - - Label(name + "_skip") + if false { + JZ(LabelRef(name + "_skip")) + MOVQ(offsets[1], offsets[2]) // s.prevOffset[2] = s.prevOffset[1] + Label(name + "_skip") + } else { + CMOVQNE(offsets[1], offsets[2]) + } // s.prevOffset[1] = s.prevOffset[0] // s.prevOffset[0] = temp - tmp = GP64() - MOVQ(po0.Addr, tmp) - MOVQ(tmp, po1.Addr) // s.prevOffset[1] = s.prevOffset[0] - MOVQ(temp, po0.Addr) // s.prevOffset[0] = temp - MOVQ(temp, offset) // return temp + MOVQ(offsets[0], offsets[1]) + MOVQ(temp, offsets[0]) + MOVQ(temp, offset) // return temp } Label(name + "_end") return offset diff --git a/zstd/seqdec_amd64.s b/zstd/seqdec_amd64.s index 6525415a9b..d727d9c241 100644 --- a/zstd/seqdec_amd64.s +++ b/zstd/seqdec_amd64.s @@ -4,7 +4,7 @@ // +build !appengine,!noasm,gc,!noasm // func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int -// Requires: CMOV, SSE +// Requires: CMOV TEXT ·sequenceDecs_decode_amd64(SB), $8-32 MOVQ br+8(FP), AX MOVQ 32(AX), DX @@ -18,9 +18,13 @@ TEXT ·sequenceDecs_decode_amd64(SB), $8-32 MOVQ 80(AX), R8 MOVQ 88(AX), R9 MOVQ 104(AX), R10 + MOVQ s+0(FP), AX + MOVQ 144(AX), R11 + MOVQ 152(AX), R12 + MOVQ 160(AX), R13 sequenceDecs_decode_amd64_main_loop: - MOVQ (SP), R11 + MOVQ (SP), R14 // Fill bitreader to have enough for the offset. CMPQ BX, $0x20 @@ -28,10 +32,10 @@ sequenceDecs_decode_amd64_main_loop: CMPQ SI, $0x04 JL sequenceDecs_decode_amd64_fill_byte_by_byte SHLQ $0x20, DX - SUBQ $0x04, R11 + SUBQ $0x04, R14 SUBQ $0x04, SI SUBQ $0x20, BX - MOVLQZX (R11), AX + MOVLQZX (R14), AX ORQ AX, DX JMP sequenceDecs_decode_amd64_fill_end @@ -39,10 +43,10 @@ sequenceDecs_decode_amd64_fill_byte_by_byte: CMPQ SI, $0x00 JLE sequenceDecs_decode_amd64_fill_end SHLQ $0x08, DX - SUBQ $0x01, R11 + SUBQ $0x01, R14 SUBQ $0x01, SI SUBQ $0x08, BX - MOVBQZX (R11), AX + MOVBQZX (R14), AX ORQ AX, DX JMP sequenceDecs_decode_amd64_fill_byte_by_byte @@ -50,16 +54,16 @@ sequenceDecs_decode_amd64_fill_end: // Update offset MOVQ R9, AX MOVQ BX, CX - MOVQ DX, R12 - SHLQ CL, R12 + MOVQ DX, R15 + SHLQ CL, R15 MOVB AH, CL ADDQ CX, BX NEGL CX - SHRQ CL, R12 + SHRQ CL, R15 SHRQ $0x20, AX TESTQ CX, CX - CMOVQEQ CX, R12 - ADDQ R12, AX + CMOVQEQ CX, R15 + ADDQ R15, AX MOVQ AX, 16(R10) // Fill bitreader for match and literal @@ -68,10 +72,10 @@ sequenceDecs_decode_amd64_fill_end: CMPQ SI, $0x04 JL sequenceDecs_decode_amd64_fill_2_byte_by_byte SHLQ $0x20, DX - SUBQ $0x04, R11 + SUBQ $0x04, R14 SUBQ $0x04, SI SUBQ $0x20, BX - MOVLQZX (R11), AX + MOVLQZX (R14), AX ORQ AX, DX JMP sequenceDecs_decode_amd64_fill_2_end @@ -79,10 +83,10 @@ sequenceDecs_decode_amd64_fill_2_byte_by_byte: CMPQ SI, $0x00 JLE sequenceDecs_decode_amd64_fill_2_end SHLQ $0x08, DX - SUBQ $0x01, R11 + SUBQ $0x01, R14 SUBQ $0x01, SI SUBQ $0x08, BX - MOVBQZX (R11), AX + MOVBQZX (R14), AX ORQ AX, DX JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte @@ -90,31 +94,31 @@ sequenceDecs_decode_amd64_fill_2_end: // Update match length MOVQ R8, AX MOVQ BX, CX - MOVQ DX, R12 - SHLQ CL, R12 + MOVQ DX, R15 + SHLQ CL, R15 MOVB AH, CL ADDQ CX, BX NEGL CX - SHRQ CL, R12 + SHRQ CL, R15 SHRQ $0x20, AX TESTQ CX, CX - CMOVQEQ CX, R12 - ADDQ R12, AX + CMOVQEQ CX, R15 + ADDQ R15, AX MOVQ AX, 8(R10) // Update literal length MOVQ DI, AX MOVQ BX, CX - MOVQ DX, R12 - SHLQ CL, R12 + MOVQ DX, R15 + SHLQ CL, R15 MOVB AH, CL ADDQ CX, BX NEGL CX - SHRQ CL, R12 + SHRQ CL, R15 SHRQ $0x20, AX TESTQ CX, CX - CMOVQEQ CX, R12 - ADDQ R12, AX + CMOVQEQ CX, R15 + ADDQ R15, AX MOVQ AX, (R10) // Fill bitreader for state updates @@ -123,10 +127,10 @@ sequenceDecs_decode_amd64_fill_2_end: CMPQ SI, $0x04 JL sequenceDecs_decode_amd64_fill_3_byte_by_byte SHLQ $0x20, DX - SUBQ $0x04, R11 + SUBQ $0x04, R14 SUBQ $0x04, SI SUBQ $0x20, BX - MOVLQZX (R11), AX + MOVLQZX (R14), AX ORQ AX, DX JMP sequenceDecs_decode_amd64_fill_3_end @@ -134,15 +138,15 @@ sequenceDecs_decode_amd64_fill_3_byte_by_byte: CMPQ SI, $0x00 JLE sequenceDecs_decode_amd64_fill_3_end SHLQ $0x08, DX - SUBQ $0x01, R11 + SUBQ $0x01, R14 SUBQ $0x01, SI SUBQ $0x08, BX - MOVBQZX (R11), AX + MOVBQZX (R14), AX ORQ AX, DX JMP sequenceDecs_decode_amd64_fill_3_byte_by_byte sequenceDecs_decode_amd64_fill_3_end: - MOVQ R11, (SP) + MOVQ R14, (SP) MOVQ R9, AX SHRQ $0x08, AX MOVBQZX AL, AX @@ -151,19 +155,19 @@ sequenceDecs_decode_amd64_fill_3_end: JZ sequenceDecs_decode_amd64_skip_update // Update Literal Length State - MOVBQZX DI, R11 + MOVBQZX DI, R14 SHRQ $0x10, DI MOVWQZX DI, DI - CMPQ R11, $0x00 + CMPQ R14, $0x00 JZ sequenceDecs_decode_amd64_llState_updateState_skip_zero MOVQ BX, CX - ADDQ R11, BX - MOVQ DX, R12 - SHLQ CL, R12 - MOVQ R11, CX + ADDQ R14, BX + MOVQ DX, R15 + SHLQ CL, R15 + MOVQ R14, CX NEGQ CX - SHRQ CL, R12 - ADDQ R12, DI + SHRQ CL, R15 + ADDQ R15, DI sequenceDecs_decode_amd64_llState_updateState_skip_zero: // Load ctx.llTable @@ -172,19 +176,19 @@ sequenceDecs_decode_amd64_llState_updateState_skip_zero: MOVQ (CX)(DI*8), DI // Update Match Length State - MOVBQZX R8, R11 + MOVBQZX R8, R14 SHRQ $0x10, R8 MOVWQZX R8, R8 - CMPQ R11, $0x00 + CMPQ R14, $0x00 JZ sequenceDecs_decode_amd64_mlState_updateState_skip_zero MOVQ BX, CX - ADDQ R11, BX - MOVQ DX, R12 - SHLQ CL, R12 - MOVQ R11, CX + ADDQ R14, BX + MOVQ DX, R15 + SHLQ CL, R15 + MOVQ R14, CX NEGQ CX - SHRQ CL, R12 - ADDQ R12, R8 + SHRQ CL, R15 + ADDQ R15, R8 sequenceDecs_decode_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable @@ -193,19 +197,19 @@ sequenceDecs_decode_amd64_mlState_updateState_skip_zero: MOVQ (CX)(R8*8), R8 // Update Offset State - MOVBQZX R9, R11 + MOVBQZX R9, R14 SHRQ $0x10, R9 MOVWQZX R9, R9 - CMPQ R11, $0x00 + CMPQ R14, $0x00 JZ sequenceDecs_decode_amd64_ofState_updateState_skip_zero MOVQ BX, CX - ADDQ R11, BX - MOVQ DX, R12 - SHLQ CL, R12 - MOVQ R11, CX + ADDQ R14, BX + MOVQ DX, R15 + SHLQ CL, R15 + MOVQ R14, CX NEGQ CX - SHRQ CL, R12 - ADDQ R12, R9 + SHRQ CL, R15 + ADDQ R15, R9 sequenceDecs_decode_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable @@ -215,65 +219,75 @@ sequenceDecs_decode_amd64_ofState_updateState_skip_zero: sequenceDecs_decode_amd64_skip_update: // Adjust offset - MOVQ s+0(FP), CX - MOVQ 16(R10), R11 - CMPQ AX, $0x01 - JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0 - MOVUPS 144(CX), X0 - MOVQ R11, 144(CX) - MOVUPS X0, 152(CX) - JMP sequenceDecs_decode_amd64_adjust_end + MOVQ 16(R10), CX + CMPQ AX, $0x01 + JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0 + MOVQ R12, R13 + MOVQ R11, R12 + MOVQ CX, R11 + JMP sequenceDecs_decode_amd64_adjust_end sequenceDecs_decode_amd64_adjust_offsetB_1_or_0: CMPQ (R10), $0x00000000 JNE sequenceDecs_decode_amd64_adjust_offset_maybezero - INCQ R11 + INCQ CX JMP sequenceDecs_decode_amd64_adjust_offset_nonzero sequenceDecs_decode_amd64_adjust_offset_maybezero: - TESTQ R11, R11 + TESTQ CX, CX JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero - MOVQ 144(CX), R11 + MOVQ R11, CX JMP sequenceDecs_decode_amd64_adjust_end sequenceDecs_decode_amd64_adjust_offset_nonzero: - MOVQ R11, AX - XORQ R12, R12 - MOVQ $-1, R13 - CMPQ R11, $0x03 - CMOVQEQ R12, AX - CMOVQEQ R13, R12 - LEAQ 144(CX), R13 - ADDQ (R13)(AX*8), R12 - JNZ sequenceDecs_decode_amd64_adjust_temp_valid - MOVQ $0x00000001, R12 + CMPQ CX, $0x01 + JB sequenceDecs_decode_amd64_adjust_zero + JEQ sequenceDecs_decode_amd64_adjust_one + CMPQ CX, $0x02 + JA sequenceDecs_decode_amd64_adjust_three + JMP sequenceDecs_decode_amd64_adjust_two -sequenceDecs_decode_amd64_adjust_temp_valid: - CMPQ R11, $0x01 - JZ sequenceDecs_decode_amd64_adjust_skip - MOVQ 152(CX), AX - MOVQ AX, 160(CX) +sequenceDecs_decode_amd64_adjust_zero: + MOVQ R11, AX + JMP sequenceDecs_decode_amd64_adjust_test_temp_valid + +sequenceDecs_decode_amd64_adjust_one: + MOVQ R12, AX + JMP sequenceDecs_decode_amd64_adjust_test_temp_valid + +sequenceDecs_decode_amd64_adjust_two: + MOVQ R13, AX + JMP sequenceDecs_decode_amd64_adjust_test_temp_valid + +sequenceDecs_decode_amd64_adjust_three: + LEAQ -1(R11), AX -sequenceDecs_decode_amd64_adjust_skip: - MOVQ 144(CX), AX - MOVQ AX, 152(CX) - MOVQ R12, 144(CX) - MOVQ R12, R11 +sequenceDecs_decode_amd64_adjust_test_temp_valid: + TESTQ AX, AX + JNZ sequenceDecs_decode_amd64_adjust_temp_valid + MOVQ $0x00000001, AX + +sequenceDecs_decode_amd64_adjust_temp_valid: + CMPQ CX, $0x01 + CMOVQNE R12, R13 + MOVQ R11, R12 + MOVQ AX, R11 + MOVQ AX, CX sequenceDecs_decode_amd64_adjust_end: - MOVQ R11, 16(R10) + MOVQ CX, 16(R10) // Check values MOVQ 8(R10), AX - MOVQ (R10), CX - LEAQ (AX)(CX*1), R12 - MOVQ s+0(FP), R13 - ADDQ R12, 256(R13) - MOVQ ctx+16(FP), R12 - SUBQ CX, 128(R12) + MOVQ (R10), R14 + LEAQ (AX)(R14*1), R15 + MOVQ s+0(FP), BP + ADDQ R15, 256(BP) + MOVQ ctx+16(FP), R15 + SUBQ R14, 128(R15) CMPQ AX, $0x00020002 JA sequenceDecs_decode_amd64_error_match_len_too_big - TESTQ R11, R11 + TESTQ CX, CX JNZ sequenceDecs_decode_amd64_match_len_ofs_ok TESTQ AX, AX JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch @@ -283,6 +297,10 @@ sequenceDecs_decode_amd64_match_len_ofs_ok: MOVQ ctx+16(FP), AX DECQ 96(AX) JNS sequenceDecs_decode_amd64_main_loop + MOVQ s+0(FP), AX + MOVQ R11, 144(AX) + MOVQ R12, 152(AX) + MOVQ R13, 160(AX) MOVQ br+8(FP), AX MOVQ DX, 32(AX) MOVB BL, 40(AX) @@ -303,7 +321,7 @@ sequenceDecs_decode_amd64_error_match_len_too_big: RET // func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int -// Requires: BMI, BMI2, CMOV, SSE +// Requires: BMI, BMI2, CMOV TEXT ·sequenceDecs_decode_bmi2(SB), $8-32 MOVQ br+8(FP), CX MOVQ 32(CX), AX @@ -317,9 +335,13 @@ TEXT ·sequenceDecs_decode_bmi2(SB), $8-32 MOVQ 80(CX), DI MOVQ 88(CX), R8 MOVQ 104(CX), R9 + MOVQ s+0(FP), CX + MOVQ 144(CX), R10 + MOVQ 152(CX), R11 + MOVQ 160(CX), R12 sequenceDecs_decode_bmi2_main_loop: - MOVQ (SP), R10 + MOVQ (SP), R13 // Fill bitreader to have enough for the offset. CMPQ DX, $0x20 @@ -327,10 +349,10 @@ sequenceDecs_decode_bmi2_main_loop: CMPQ BX, $0x04 JL sequenceDecs_decode_bmi2_fill_byte_by_byte SHLQ $0x20, AX - SUBQ $0x04, R10 + SUBQ $0x04, R13 SUBQ $0x04, BX SUBQ $0x20, DX - MOVLQZX (R10), CX + MOVLQZX (R13), CX ORQ CX, AX JMP sequenceDecs_decode_bmi2_fill_end @@ -338,25 +360,25 @@ sequenceDecs_decode_bmi2_fill_byte_by_byte: CMPQ BX, $0x00 JLE sequenceDecs_decode_bmi2_fill_end SHLQ $0x08, AX - SUBQ $0x01, R10 + SUBQ $0x01, R13 SUBQ $0x01, BX SUBQ $0x08, DX - MOVBQZX (R10), CX + MOVBQZX (R13), CX ORQ CX, AX JMP sequenceDecs_decode_bmi2_fill_byte_by_byte sequenceDecs_decode_bmi2_fill_end: // Update offset MOVQ $0x00000808, CX - BEXTRQ CX, R8, R11 - MOVQ AX, R12 - LEAQ (DX)(R11*1), CX - ROLQ CL, R12 - BZHIQ R11, R12, R12 + BEXTRQ CX, R8, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 MOVQ CX, DX MOVQ R8, CX SHRQ $0x20, CX - ADDQ R12, CX + ADDQ R15, CX MOVQ CX, 16(R9) // Fill bitreader for match and literal @@ -365,10 +387,10 @@ sequenceDecs_decode_bmi2_fill_end: CMPQ BX, $0x04 JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte SHLQ $0x20, AX - SUBQ $0x04, R10 + SUBQ $0x04, R13 SUBQ $0x04, BX SUBQ $0x20, DX - MOVLQZX (R10), CX + MOVLQZX (R13), CX ORQ CX, AX JMP sequenceDecs_decode_bmi2_fill_2_end @@ -376,38 +398,38 @@ sequenceDecs_decode_bmi2_fill_2_byte_by_byte: CMPQ BX, $0x00 JLE sequenceDecs_decode_bmi2_fill_2_end SHLQ $0x08, AX - SUBQ $0x01, R10 + SUBQ $0x01, R13 SUBQ $0x01, BX SUBQ $0x08, DX - MOVBQZX (R10), CX + MOVBQZX (R13), CX ORQ CX, AX JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte sequenceDecs_decode_bmi2_fill_2_end: // Update match length MOVQ $0x00000808, CX - BEXTRQ CX, DI, R11 - MOVQ AX, R12 - LEAQ (DX)(R11*1), CX - ROLQ CL, R12 - BZHIQ R11, R12, R12 + BEXTRQ CX, DI, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 MOVQ CX, DX MOVQ DI, CX SHRQ $0x20, CX - ADDQ R12, CX + ADDQ R15, CX MOVQ CX, 8(R9) // Update literal length MOVQ $0x00000808, CX - BEXTRQ CX, SI, R11 - MOVQ AX, R12 - LEAQ (DX)(R11*1), CX - ROLQ CL, R12 - BZHIQ R11, R12, R12 + BEXTRQ CX, SI, R14 + MOVQ AX, R15 + LEAQ (DX)(R14*1), CX + ROLQ CL, R15 + BZHIQ R14, R15, R15 MOVQ CX, DX MOVQ SI, CX SHRQ $0x20, CX - ADDQ R12, CX + ADDQ R15, CX MOVQ CX, (R9) // Fill bitreader for state updates @@ -416,10 +438,10 @@ sequenceDecs_decode_bmi2_fill_2_end: CMPQ BX, $0x04 JL sequenceDecs_decode_bmi2_fill_3_byte_by_byte SHLQ $0x20, AX - SUBQ $0x04, R10 + SUBQ $0x04, R13 SUBQ $0x04, BX SUBQ $0x20, DX - MOVLQZX (R10), CX + MOVLQZX (R13), CX ORQ CX, AX JMP sequenceDecs_decode_bmi2_fill_3_end @@ -427,31 +449,31 @@ sequenceDecs_decode_bmi2_fill_3_byte_by_byte: CMPQ BX, $0x00 JLE sequenceDecs_decode_bmi2_fill_3_end SHLQ $0x08, AX - SUBQ $0x01, R10 + SUBQ $0x01, R13 SUBQ $0x01, BX SUBQ $0x08, DX - MOVBQZX (R10), CX + MOVBQZX (R13), CX ORQ CX, AX JMP sequenceDecs_decode_bmi2_fill_3_byte_by_byte sequenceDecs_decode_bmi2_fill_3_end: - MOVQ R10, (SP) + MOVQ R13, (SP) MOVQ $0x00000808, CX - BEXTRQ CX, R8, R10 + BEXTRQ CX, R8, R13 MOVQ ctx+16(FP), CX CMPQ 96(CX), $0x00 JZ sequenceDecs_decode_bmi2_skip_update // Update Literal Length State - MOVBQZX SI, R11 + MOVBQZX SI, R14 MOVQ $0x00001010, CX BEXTRQ CX, SI, SI - LEAQ (DX)(R11*1), CX - MOVQ AX, R12 + LEAQ (DX)(R14*1), CX + MOVQ AX, R15 MOVQ CX, DX - ROLQ CL, R12 - BZHIQ R11, R12, R12 - ADDQ R12, SI + ROLQ CL, R15 + BZHIQ R14, R15, R15 + ADDQ R15, SI // Load ctx.llTable MOVQ ctx+16(FP), CX @@ -459,15 +481,15 @@ sequenceDecs_decode_bmi2_fill_3_end: MOVQ (CX)(SI*8), SI // Update Match Length State - MOVBQZX DI, R11 + MOVBQZX DI, R14 MOVQ $0x00001010, CX BEXTRQ CX, DI, DI - LEAQ (DX)(R11*1), CX - MOVQ AX, R12 + LEAQ (DX)(R14*1), CX + MOVQ AX, R15 MOVQ CX, DX - ROLQ CL, R12 - BZHIQ R11, R12, R12 - ADDQ R12, DI + ROLQ CL, R15 + BZHIQ R14, R15, R15 + ADDQ R15, DI // Load ctx.mlTable MOVQ ctx+16(FP), CX @@ -475,15 +497,15 @@ sequenceDecs_decode_bmi2_fill_3_end: MOVQ (CX)(DI*8), DI // Update Offset State - MOVBQZX R8, R11 + MOVBQZX R8, R14 MOVQ $0x00001010, CX BEXTRQ CX, R8, R8 - LEAQ (DX)(R11*1), CX - MOVQ AX, R12 + LEAQ (DX)(R14*1), CX + MOVQ AX, R15 MOVQ CX, DX - ROLQ CL, R12 - BZHIQ R11, R12, R12 - ADDQ R12, R8 + ROLQ CL, R15 + BZHIQ R14, R15, R15 + ADDQ R15, R8 // Load ctx.ofTable MOVQ ctx+16(FP), CX @@ -492,67 +514,77 @@ sequenceDecs_decode_bmi2_fill_3_end: sequenceDecs_decode_bmi2_skip_update: // Adjust offset - MOVQ s+0(FP), CX - MOVQ 16(R9), R11 - CMPQ R10, $0x01 - JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0 - MOVUPS 144(CX), X0 - MOVQ R11, 144(CX) - MOVUPS X0, 152(CX) - JMP sequenceDecs_decode_bmi2_adjust_end + MOVQ 16(R9), CX + CMPQ R13, $0x01 + JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0 + MOVQ R11, R12 + MOVQ R10, R11 + MOVQ CX, R10 + JMP sequenceDecs_decode_bmi2_adjust_end sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0: CMPQ (R9), $0x00000000 JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero - INCQ R11 + INCQ CX JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero sequenceDecs_decode_bmi2_adjust_offset_maybezero: - TESTQ R11, R11 + TESTQ CX, CX JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero - MOVQ 144(CX), R11 + MOVQ R10, CX JMP sequenceDecs_decode_bmi2_adjust_end sequenceDecs_decode_bmi2_adjust_offset_nonzero: - MOVQ R11, R10 - XORQ R12, R12 - MOVQ $-1, R13 - CMPQ R11, $0x03 - CMOVQEQ R12, R10 - CMOVQEQ R13, R12 - LEAQ 144(CX), R13 - ADDQ (R13)(R10*8), R12 - JNZ sequenceDecs_decode_bmi2_adjust_temp_valid - MOVQ $0x00000001, R12 + CMPQ CX, $0x01 + JB sequenceDecs_decode_bmi2_adjust_zero + JEQ sequenceDecs_decode_bmi2_adjust_one + CMPQ CX, $0x02 + JA sequenceDecs_decode_bmi2_adjust_three + JMP sequenceDecs_decode_bmi2_adjust_two -sequenceDecs_decode_bmi2_adjust_temp_valid: - CMPQ R11, $0x01 - JZ sequenceDecs_decode_bmi2_adjust_skip - MOVQ 152(CX), R10 - MOVQ R10, 160(CX) +sequenceDecs_decode_bmi2_adjust_zero: + MOVQ R10, R13 + JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_bmi2_adjust_one: + MOVQ R11, R13 + JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_bmi2_adjust_two: + MOVQ R12, R13 + JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid + +sequenceDecs_decode_bmi2_adjust_three: + LEAQ -1(R10), R13 + +sequenceDecs_decode_bmi2_adjust_test_temp_valid: + TESTQ R13, R13 + JNZ sequenceDecs_decode_bmi2_adjust_temp_valid + MOVQ $0x00000001, R13 -sequenceDecs_decode_bmi2_adjust_skip: - MOVQ 144(CX), R10 - MOVQ R10, 152(CX) - MOVQ R12, 144(CX) - MOVQ R12, R11 +sequenceDecs_decode_bmi2_adjust_temp_valid: + CMPQ CX, $0x01 + CMOVQNE R11, R12 + MOVQ R10, R11 + MOVQ R13, R10 + MOVQ R13, CX sequenceDecs_decode_bmi2_adjust_end: - MOVQ R11, 16(R9) + MOVQ CX, 16(R9) // Check values - MOVQ 8(R9), CX - MOVQ (R9), R10 - LEAQ (CX)(R10*1), R12 - MOVQ s+0(FP), R13 - ADDQ R12, 256(R13) - MOVQ ctx+16(FP), R12 - SUBQ R10, 128(R12) - CMPQ CX, $0x00020002 + MOVQ 8(R9), R13 + MOVQ (R9), R14 + LEAQ (R13)(R14*1), R15 + MOVQ s+0(FP), BP + ADDQ R15, 256(BP) + MOVQ ctx+16(FP), R15 + SUBQ R14, 128(R15) + CMPQ R13, $0x00020002 JA sequenceDecs_decode_bmi2_error_match_len_too_big - TESTQ R11, R11 - JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok TESTQ CX, CX + JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok + TESTQ R13, R13 JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch sequenceDecs_decode_bmi2_match_len_ofs_ok: @@ -560,6 +592,10 @@ sequenceDecs_decode_bmi2_match_len_ofs_ok: MOVQ ctx+16(FP), CX DECQ 96(CX) JNS sequenceDecs_decode_bmi2_main_loop + MOVQ s+0(FP), CX + MOVQ R10, 144(CX) + MOVQ R11, 152(CX) + MOVQ R12, 160(CX) MOVQ br+8(FP), CX MOVQ AX, 32(CX) MOVB DL, 40(CX)