diff --git a/huff0/_generate/gen.go b/huff0/_generate/gen.go index dd111bd8c3..900cba8168 100644 --- a/huff0/_generate/gen.go +++ b/huff0/_generate/gen.go @@ -47,14 +47,11 @@ func (d decompress4x) generateProcedure(name string) { Doc(name+" is an x86 assembler implementation of Decompress4X when tablelog > 8.decodes a sequence", "") Pragma("noescape") - exhausted := GP64() - XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false - + exhausted := GP8() + buffer := GP64() limit := GP64() - bufferOrigin := GP64() peekBits := GP64() - buffer := GP64() dstEvery := GP64() table := GP64() @@ -64,7 +61,7 @@ func (d decompress4x) generateProcedure(name string) { { ctx := Dereference(Param("ctx")) Load(ctx.Field("peekBits"), peekBits) - Load(ctx.Field("out"), bufferOrigin) + Load(ctx.Field("out"), buffer) Load(ctx.Field("limit"), limit) Load(ctx.Field("dstEvery"), dstEvery) Load(ctx.Field("tbl"), table) @@ -74,27 +71,26 @@ func (d decompress4x) generateProcedure(name string) { Comment("Main loop") Label("main_loop") - MOVQ(bufferOrigin, buffer) - // Check if we have space + // Check if we have space. We could zero exhausted outside the loop, + // but doing it here is a hint to the CPU that there's no dependency + // on the previous iteration's value. + XORL(exhausted.As32(), exhausted.As32()) CMPQ(buffer, limit) SETGE(exhausted.As8()) - d.decodeTwoValues(0, br, peekBits, table, buffer, exhausted) - ADDQ(dstEvery, buffer) - d.decodeTwoValues(1, br, peekBits, table, buffer, exhausted) - ADDQ(dstEvery, buffer) - d.decodeTwoValues(2, br, peekBits, table, buffer, exhausted) - ADDQ(dstEvery, buffer) - d.decodeTwoValues(3, br, peekBits, table, buffer, exhausted) + d.decodeTwoValues(0, br, peekBits, table, buffer, dstEvery, exhausted) + d.decodeTwoValues(1, br, peekBits, table, buffer, dstEvery, exhausted) + d.decodeTwoValues(2, br, peekBits, table, buffer, dstEvery, exhausted) + d.decodeTwoValues(3, br, peekBits, table, buffer, dstEvery, exhausted) - ADDQ(U8(2), bufferOrigin) // off += 2 + ADDQ(U8(2), buffer) // off += 2 - TESTB(exhausted.As8(), exhausted.As8()) // any br[i].ofs < 4? + TESTB(exhausted, exhausted) // any br[i].ofs < 4? JZ(LabelRef("main_loop")) { ctx := Dereference(Param("ctx")) ctxout, _ := ctx.Field("out").Resolve() - decoded := bufferOrigin + decoded := buffer SUBQ(ctxout.Addr, decoded) SHLQ(U8(2), decoded) // decoded *= 4 @@ -105,15 +101,14 @@ func (d decompress4x) generateProcedure(name string) { } // TODO [wmu]: I believe it's doable in avo, but can't figure out how to deal -// -// with arbitrary pointers to a given type +// with arbitrary pointers to a given type const bitReader_in = 0 const bitReader_off = bitReader_in + 3*8 // {ptr, len, cap} const bitReader_value = bitReader_off + 8 const bitReader_bitsRead = bitReader_value + 8 const bitReader__size = bitReader_bitsRead + 8 -func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) { +func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, dstEvery, exhausted reg.GPVirtual) { brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted) val := GP64() @@ -149,7 +144,7 @@ func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhau Comment("these two writes get coalesced") Comment("out[id * dstEvery + 0] = uint8(v0.entry >> 8)") Comment("out[id * dstEvery + 1] = uint8(v1.entry >> 8)") - MOVW(out.As16(), Mem{Base: buffer}) + MOVW(out.As16(), bufferIndex(id, buffer, dstEvery)) Comment("update the bitreader structure") offset := id * bitReader__size @@ -163,14 +158,11 @@ func (d decompress4x) generateProcedure4x8bit(name string) { Doc(name+" is an x86 assembler implementation of Decompress4X when tablelog > 8.decodes a sequence", "") Pragma("noescape") - exhausted := GP64() // Fixed since we need 8H - XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false - - bufferOrigin := GP64() + exhausted := GP8() + buffer := GP64() limit := GP64() peekBits := GP64() - buffer := GP64() dstEvery := GP64() table := GP64() @@ -180,7 +172,7 @@ func (d decompress4x) generateProcedure4x8bit(name string) { { ctx := Dereference(Param("ctx")) Load(ctx.Field("peekBits"), peekBits) - Load(ctx.Field("out"), bufferOrigin) + Load(ctx.Field("out"), buffer) Load(ctx.Field("limit"), limit) Load(ctx.Field("dstEvery"), dstEvery) Load(ctx.Field("tbl"), table) @@ -190,27 +182,26 @@ func (d decompress4x) generateProcedure4x8bit(name string) { Comment("Main loop") Label("main_loop") - MOVQ(bufferOrigin, buffer) - // Check if we have space + // Check if we have space. We could zero exhausted outside the loop, + // but doing it here is a hint to the CPU that there's no dependency + // on the previous iteration's value. + XORL(exhausted.As32(), exhausted.As32()) CMPQ(buffer, limit) - SETGE(exhausted.As8()) - d.decodeFourValues(0, br, peekBits, table, buffer, exhausted) - ADDQ(dstEvery, buffer) - d.decodeFourValues(1, br, peekBits, table, buffer, exhausted) - ADDQ(dstEvery, buffer) - d.decodeFourValues(2, br, peekBits, table, buffer, exhausted) - ADDQ(dstEvery, buffer) - d.decodeFourValues(3, br, peekBits, table, buffer, exhausted) + SETGE(exhausted) + d.decodeFourValues(0, br, peekBits, table, buffer, dstEvery, exhausted) + d.decodeFourValues(1, br, peekBits, table, buffer, dstEvery, exhausted) + d.decodeFourValues(2, br, peekBits, table, buffer, dstEvery, exhausted) + d.decodeFourValues(3, br, peekBits, table, buffer, dstEvery, exhausted) - ADDQ(U8(4), bufferOrigin) // off += 4 + ADDQ(U8(4), buffer) // off += 4 - TESTB(exhausted.As8(), exhausted.As8()) // any br[i].ofs < 4? + TESTB(exhausted, exhausted) // any br[i].ofs < 4? JZ(LabelRef("main_loop")) { ctx := Dereference(Param("ctx")) ctxout, _ := ctx.Field("out").Resolve() - decoded := bufferOrigin + decoded := buffer SUBQ(ctxout.Addr, decoded) SHLQ(U8(2), decoded) // decoded *= 4 @@ -219,7 +210,7 @@ func (d decompress4x) generateProcedure4x8bit(name string) { RET() } -func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) { +func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, dstEvery, exhausted reg.GPVirtual) { brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted) decompress := func(valID int, outByte reg.Register) { @@ -253,7 +244,7 @@ func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exha Comment("out[id * dstEvery + 1] = uint8(v1.entry >> 8)") Comment("out[id * dstEvery + 3] = uint8(v2.entry >> 8)") Comment("out[id * dstEvery + 4] = uint8(v3.entry >> 8)") - MOVL(out.As32(), Mem{Base: buffer}) + MOVL(out.As32(), bufferIndex(id, buffer, dstEvery)) Comment("update the bitreader structure") offset := id * bitReader__size @@ -261,6 +252,21 @@ func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exha MOVB(brBitsRead.As8(), Mem{Base: br, Disp: offset + bitReader_bitsRead}) } +func bufferIndex(id int, buffer, dstEvery reg.GPVirtual) Mem { + switch id { + case 0: + return Mem{Base: buffer} + case 1, 2: + return Mem{Base: buffer, Index: dstEvery, Scale: byte(id)} + case 3: + stride3 := GP64() // stride3 := 3*dstEvery + LEAQ(Mem{Base: dstEvery, Index: dstEvery, Scale: 2}, stride3) + return Mem{Base: buffer, Index: stride3, Scale: 1} + default: + panic("id must be >=0, <4") + } +} + func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (brValue, brBitsRead reg.GPVirtual) { if atLeast > 32 { panic(fmt.Sprintf("at least (%d) cannot be >32", atLeast)) @@ -297,11 +303,11 @@ func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) ( MOVQ(brOffset, Mem{Base: br, Disp: offset + bitReader_off}) ORQ(tmp.As64(), brValue) { - Commentf("exhausted = exhausted || (br%d.off < 4)", id) + Commentf("exhausted += (br%d.off < 4)", id) CMPQ(brOffset, U8(4)) - tmp = GP64() - SETLT(tmp.As8()) - ORB(tmp.As8(), exhausted.As8()) + // Add carry from brOffset-4. We do this at most four times per iteration, + // and every iteration resets exhausted's lower byte, so it doesn't overflow. + ADCB(I8(0), exhausted) } Label("skip_fill" + strconv.Itoa(id)) @@ -409,7 +415,7 @@ func (d decompress1x) generateProcedure(name string) { outCap := GP64() Load(ctx.Field("outCap"), outCap) CMPQ(outCap, U8(4)) - JB(LabelRef("error_max_decoded_size_exeeded")) + JB(LabelRef("error_max_decoded_size_exceeded")) LEAQ(Mem{Base: buffer, Index: outCap, Scale: 1}, bufferEnd) @@ -432,7 +438,7 @@ func (d decompress1x) generateProcedure(name string) { tmp := GP64() LEAQ(Mem{Base: buffer, Disp: 4}, tmp) CMPQ(tmp, bufferEnd) - JGE(LabelRef("error_max_decoded_size_exeeded")) + JGE(LabelRef("error_max_decoded_size_exceeded")) } decompress := func(id int, out reg.Register) { @@ -474,7 +480,7 @@ func (d decompress1x) generateProcedure(name string) { RET() Comment("Report error") - Label("error_max_decoded_size_exeeded") + Label("error_max_decoded_size_exceeded") { ctx := Dereference(Param("ctx")) tmp := GP64() diff --git a/huff0/decompress_amd64.s b/huff0/decompress_amd64.s index 8d2187a2ce..c4c7ab2d1f 100644 --- a/huff0/decompress_amd64.s +++ b/huff0/decompress_amd64.s @@ -4,360 +4,349 @@ // func decompress4x_main_loop_amd64(ctx *decompress4xContext) TEXT ·decompress4x_main_loop_amd64(SB), $0-8 - XORQ DX, DX - // Preload values MOVQ ctx+0(FP), AX MOVBQZX 8(AX), DI - MOVQ 16(AX), SI - MOVQ 48(AX), BX - MOVQ 24(AX), R9 - MOVQ 32(AX), R10 - MOVQ (AX), R11 + MOVQ 16(AX), BX + MOVQ 48(AX), SI + MOVQ 24(AX), R8 + MOVQ 32(AX), R9 + MOVQ (AX), R10 // Main loop main_loop: - MOVQ SI, R8 - CMPQ R8, BX + XORL DX, DX + CMPQ BX, SI SETGE DL // br0.fillFast32() - MOVQ 32(R11), R12 - MOVBQZX 40(R11), R13 - CMPQ R13, $0x20 + MOVQ 32(R10), R11 + MOVBQZX 40(R10), R12 + CMPQ R12, $0x20 JBE skip_fill0 - MOVQ 24(R11), AX - SUBQ $0x20, R13 + MOVQ 24(R10), AX + SUBQ $0x20, R12 SUBQ $0x04, AX - MOVQ (R11), R14 + MOVQ (R10), R13 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(R14*1), R14 - MOVQ R13, CX - SHLQ CL, R14 - MOVQ AX, 24(R11) - ORQ R14, R12 + MOVL (AX)(R13*1), R13 + MOVQ R12, CX + SHLQ CL, R13 + MOVQ AX, 24(R10) + ORQ R13, R11 - // exhausted = exhausted || (br0.off < 4) - CMPQ AX, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br0.off < 4) + CMPQ AX, $0x04 + ADCB $+0, DL skip_fill0: // val0 := br0.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br0.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br0.peekTopBits(peekBits) MOVQ DI, CX - MOVQ R12, R14 - SHRQ CL, R14 + MOVQ R11, R13 + SHRQ CL, R13 // v1 := table[val1&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br0.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (R8) + MOVW AX, (BX) // update the bitreader structure - MOVQ R12, 32(R11) - MOVB R13, 40(R11) - ADDQ R9, R8 + MOVQ R11, 32(R10) + MOVB R12, 40(R10) // br1.fillFast32() - MOVQ 80(R11), R12 - MOVBQZX 88(R11), R13 - CMPQ R13, $0x20 + MOVQ 80(R10), R11 + MOVBQZX 88(R10), R12 + CMPQ R12, $0x20 JBE skip_fill1 - MOVQ 72(R11), AX - SUBQ $0x20, R13 + MOVQ 72(R10), AX + SUBQ $0x20, R12 SUBQ $0x04, AX - MOVQ 48(R11), R14 + MOVQ 48(R10), R13 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(R14*1), R14 - MOVQ R13, CX - SHLQ CL, R14 - MOVQ AX, 72(R11) - ORQ R14, R12 + MOVL (AX)(R13*1), R13 + MOVQ R12, CX + SHLQ CL, R13 + MOVQ AX, 72(R10) + ORQ R13, R11 - // exhausted = exhausted || (br1.off < 4) - CMPQ AX, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br1.off < 4) + CMPQ AX, $0x04 + ADCB $+0, DL skip_fill1: // val0 := br1.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br1.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br1.peekTopBits(peekBits) MOVQ DI, CX - MOVQ R12, R14 - SHRQ CL, R14 + MOVQ R11, R13 + SHRQ CL, R13 // v1 := table[val1&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br1.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (R8) + MOVW AX, (BX)(R8*1) // update the bitreader structure - MOVQ R12, 80(R11) - MOVB R13, 88(R11) - ADDQ R9, R8 + MOVQ R11, 80(R10) + MOVB R12, 88(R10) // br2.fillFast32() - MOVQ 128(R11), R12 - MOVBQZX 136(R11), R13 - CMPQ R13, $0x20 + MOVQ 128(R10), R11 + MOVBQZX 136(R10), R12 + CMPQ R12, $0x20 JBE skip_fill2 - MOVQ 120(R11), AX - SUBQ $0x20, R13 + MOVQ 120(R10), AX + SUBQ $0x20, R12 SUBQ $0x04, AX - MOVQ 96(R11), R14 + MOVQ 96(R10), R13 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(R14*1), R14 - MOVQ R13, CX - SHLQ CL, R14 - MOVQ AX, 120(R11) - ORQ R14, R12 + MOVL (AX)(R13*1), R13 + MOVQ R12, CX + SHLQ CL, R13 + MOVQ AX, 120(R10) + ORQ R13, R11 - // exhausted = exhausted || (br2.off < 4) - CMPQ AX, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br2.off < 4) + CMPQ AX, $0x04 + ADCB $+0, DL skip_fill2: // val0 := br2.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br2.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br2.peekTopBits(peekBits) MOVQ DI, CX - MOVQ R12, R14 - SHRQ CL, R14 + MOVQ R11, R13 + SHRQ CL, R13 // v1 := table[val1&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br2.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (R8) + MOVW AX, (BX)(R8*2) // update the bitreader structure - MOVQ R12, 128(R11) - MOVB R13, 136(R11) - ADDQ R9, R8 + MOVQ R11, 128(R10) + MOVB R12, 136(R10) // br3.fillFast32() - MOVQ 176(R11), R12 - MOVBQZX 184(R11), R13 - CMPQ R13, $0x20 + MOVQ 176(R10), R11 + MOVBQZX 184(R10), R12 + CMPQ R12, $0x20 JBE skip_fill3 - MOVQ 168(R11), AX - SUBQ $0x20, R13 + MOVQ 168(R10), AX + SUBQ $0x20, R12 SUBQ $0x04, AX - MOVQ 144(R11), R14 + MOVQ 144(R10), R13 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(R14*1), R14 - MOVQ R13, CX - SHLQ CL, R14 - MOVQ AX, 168(R11) - ORQ R14, R12 + MOVL (AX)(R13*1), R13 + MOVQ R12, CX + SHLQ CL, R13 + MOVQ AX, 168(R10) + ORQ R13, R11 - // exhausted = exhausted || (br3.off < 4) - CMPQ AX, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br3.off < 4) + CMPQ AX, $0x04 + ADCB $+0, DL skip_fill3: // val0 := br3.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br3.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br3.peekTopBits(peekBits) MOVQ DI, CX - MOVQ R12, R14 - SHRQ CL, R14 + MOVQ R11, R13 + SHRQ CL, R13 // v1 := table[val1&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br3.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (R8) + LEAQ (R8)(R8*2), CX + MOVW AX, (BX)(CX*1) // update the bitreader structure - MOVQ R12, 176(R11) - MOVB R13, 184(R11) - ADDQ $0x02, SI + MOVQ R11, 176(R10) + MOVB R12, 184(R10) + ADDQ $0x02, BX TESTB DL, DL JZ main_loop MOVQ ctx+0(FP), AX - SUBQ 16(AX), SI - SHLQ $0x02, SI - MOVQ SI, 40(AX) + SUBQ 16(AX), BX + SHLQ $0x02, BX + MOVQ BX, 40(AX) RET // func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8 - XORQ DX, DX - // Preload values MOVQ ctx+0(FP), CX MOVBQZX 8(CX), DI MOVQ 16(CX), BX MOVQ 48(CX), SI - MOVQ 24(CX), R9 - MOVQ 32(CX), R10 - MOVQ (CX), R11 + MOVQ 24(CX), R8 + MOVQ 32(CX), R9 + MOVQ (CX), R10 // Main loop main_loop: - MOVQ BX, R8 - CMPQ R8, SI + XORL DX, DX + CMPQ BX, SI SETGE DL // br0.fillFast32() - MOVQ 32(R11), R12 - MOVBQZX 40(R11), R13 - CMPQ R13, $0x20 + MOVQ 32(R10), R11 + MOVBQZX 40(R10), R12 + CMPQ R12, $0x20 JBE skip_fill0 - MOVQ 24(R11), R14 - SUBQ $0x20, R13 - SUBQ $0x04, R14 - MOVQ (R11), R15 + MOVQ 24(R10), R13 + SUBQ $0x20, R12 + SUBQ $0x04, R13 + MOVQ (R10), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R14)(R15*1), R15 - MOVQ R13, CX - SHLQ CL, R15 - MOVQ R14, 24(R11) - ORQ R15, R12 + MOVL (R13)(R14*1), R14 + MOVQ R12, CX + SHLQ CL, R14 + MOVQ R13, 24(R10) + ORQ R14, R11 - // exhausted = exhausted || (br0.off < 4) - CMPQ R14, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br0.off < 4) + CMPQ R13, $0x04 + ADCB $+0, DL skip_fill0: // val0 := br0.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br0.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br0.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v1 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br0.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // val2 := br0.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v2 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br0.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val3 := br0.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v3 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br0.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // these four writes get coalesced @@ -365,88 +354,86 @@ skip_fill0: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (R8) + MOVL AX, (BX) // update the bitreader structure - MOVQ R12, 32(R11) - MOVB R13, 40(R11) - ADDQ R9, R8 + MOVQ R11, 32(R10) + MOVB R12, 40(R10) // br1.fillFast32() - MOVQ 80(R11), R12 - MOVBQZX 88(R11), R13 - CMPQ R13, $0x20 + MOVQ 80(R10), R11 + MOVBQZX 88(R10), R12 + CMPQ R12, $0x20 JBE skip_fill1 - MOVQ 72(R11), R14 - SUBQ $0x20, R13 - SUBQ $0x04, R14 - MOVQ 48(R11), R15 + MOVQ 72(R10), R13 + SUBQ $0x20, R12 + SUBQ $0x04, R13 + MOVQ 48(R10), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R14)(R15*1), R15 - MOVQ R13, CX - SHLQ CL, R15 - MOVQ R14, 72(R11) - ORQ R15, R12 + MOVL (R13)(R14*1), R14 + MOVQ R12, CX + SHLQ CL, R14 + MOVQ R13, 72(R10) + ORQ R14, R11 - // exhausted = exhausted || (br1.off < 4) - CMPQ R14, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br1.off < 4) + CMPQ R13, $0x04 + ADCB $+0, DL skip_fill1: // val0 := br1.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br1.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br1.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v1 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br1.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // val2 := br1.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v2 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br1.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val3 := br1.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v3 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br1.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // these four writes get coalesced @@ -454,88 +441,86 @@ skip_fill1: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (R8) + MOVL AX, (BX)(R8*1) // update the bitreader structure - MOVQ R12, 80(R11) - MOVB R13, 88(R11) - ADDQ R9, R8 + MOVQ R11, 80(R10) + MOVB R12, 88(R10) // br2.fillFast32() - MOVQ 128(R11), R12 - MOVBQZX 136(R11), R13 - CMPQ R13, $0x20 + MOVQ 128(R10), R11 + MOVBQZX 136(R10), R12 + CMPQ R12, $0x20 JBE skip_fill2 - MOVQ 120(R11), R14 - SUBQ $0x20, R13 - SUBQ $0x04, R14 - MOVQ 96(R11), R15 + MOVQ 120(R10), R13 + SUBQ $0x20, R12 + SUBQ $0x04, R13 + MOVQ 96(R10), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R14)(R15*1), R15 - MOVQ R13, CX - SHLQ CL, R15 - MOVQ R14, 120(R11) - ORQ R15, R12 + MOVL (R13)(R14*1), R14 + MOVQ R12, CX + SHLQ CL, R14 + MOVQ R13, 120(R10) + ORQ R14, R11 - // exhausted = exhausted || (br2.off < 4) - CMPQ R14, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br2.off < 4) + CMPQ R13, $0x04 + ADCB $+0, DL skip_fill2: // val0 := br2.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br2.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br2.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v1 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br2.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // val2 := br2.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v2 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br2.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val3 := br2.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v3 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br2.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // these four writes get coalesced @@ -543,88 +528,86 @@ skip_fill2: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (R8) + MOVL AX, (BX)(R8*2) // update the bitreader structure - MOVQ R12, 128(R11) - MOVB R13, 136(R11) - ADDQ R9, R8 + MOVQ R11, 128(R10) + MOVB R12, 136(R10) // br3.fillFast32() - MOVQ 176(R11), R12 - MOVBQZX 184(R11), R13 - CMPQ R13, $0x20 + MOVQ 176(R10), R11 + MOVBQZX 184(R10), R12 + CMPQ R12, $0x20 JBE skip_fill3 - MOVQ 168(R11), R14 - SUBQ $0x20, R13 - SUBQ $0x04, R14 - MOVQ 144(R11), R15 + MOVQ 168(R10), R13 + SUBQ $0x20, R12 + SUBQ $0x04, R13 + MOVQ 144(R10), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R14)(R15*1), R15 - MOVQ R13, CX - SHLQ CL, R15 - MOVQ R14, 168(R11) - ORQ R15, R12 + MOVL (R13)(R14*1), R14 + MOVQ R12, CX + SHLQ CL, R14 + MOVQ R13, 168(R10) + ORQ R14, R11 - // exhausted = exhausted || (br3.off < 4) - CMPQ R14, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br3.off < 4) + CMPQ R13, $0x04 + ADCB $+0, DL skip_fill3: // val0 := br3.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br3.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br3.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v1 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br3.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // val2 := br3.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v2 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br3.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val3 := br3.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v3 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br3.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // these four writes get coalesced @@ -632,11 +615,12 @@ skip_fill3: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (R8) + LEAQ (R8)(R8*2), CX + MOVL AX, (BX)(CX*1) // update the bitreader structure - MOVQ R12, 176(R11) - MOVB R13, 184(R11) + MOVQ R11, 176(R10) + MOVB R12, 184(R10) ADDQ $0x04, BX TESTB DL, DL JZ main_loop @@ -652,7 +636,7 @@ TEXT ·decompress1x_main_loop_amd64(SB), $0-8 MOVQ 16(CX), DX MOVQ 24(CX), BX CMPQ BX, $0x04 - JB error_max_decoded_size_exeeded + JB error_max_decoded_size_exceeded LEAQ (DX)(BX*1), BX MOVQ (CX), SI MOVQ (SI), R8 @@ -667,7 +651,7 @@ main_loop: // Check if we have room for 4 bytes in the output buffer LEAQ 4(DX), CX CMPQ CX, BX - JGE error_max_decoded_size_exeeded + JGE error_max_decoded_size_exceeded // Decode 4 values CMPQ R11, $0x20 @@ -744,7 +728,7 @@ loop_condition: RET // Report error -error_max_decoded_size_exeeded: +error_max_decoded_size_exceeded: MOVQ ctx+0(FP), AX MOVQ $-1, CX MOVQ CX, 40(AX) @@ -757,7 +741,7 @@ TEXT ·decompress1x_main_loop_bmi2(SB), $0-8 MOVQ 16(CX), DX MOVQ 24(CX), BX CMPQ BX, $0x04 - JB error_max_decoded_size_exeeded + JB error_max_decoded_size_exceeded LEAQ (DX)(BX*1), BX MOVQ (CX), SI MOVQ (SI), R8 @@ -772,7 +756,7 @@ main_loop: // Check if we have room for 4 bytes in the output buffer LEAQ 4(DX), CX CMPQ CX, BX - JGE error_max_decoded_size_exeeded + JGE error_max_decoded_size_exceeded // Decode 4 values CMPQ R11, $0x20 @@ -839,7 +823,7 @@ loop_condition: RET // Report error -error_max_decoded_size_exeeded: +error_max_decoded_size_exceeded: MOVQ ctx+0(FP), AX MOVQ $-1, CX MOVQ CX, 40(AX)