From 4b3cc06977ec494c7f33a46328f3592a30ccdef9 Mon Sep 17 00:00:00 2001 From: greatroar <61184462+greatroar@users.noreply.github.com> Date: Fri, 8 Jul 2022 14:29:58 +0200 Subject: [PATCH] huff0: Pass a single bitReader pointer to asm (#634) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes the context object smaller and frees up three registers, which we can use to replace the limitPtr and bufferOrigin stack variables. Benchmark results show a tiny win (Go 1.19beta, Core i7-3770K): name old speed new speed delta Decompress1XTable/digits-8 347MB/s ± 0% 347MB/s ± 0% ~ (p=0.650 n=8+10) Decompress1XTable/gettysburg-8 268MB/s ± 0% 268MB/s ± 0% ~ (p=0.400 n=9+9) Decompress1XTable/twain-8 327MB/s ± 0% 327MB/s ± 1% ~ (p=0.339 n=7+9) Decompress1XTable/low-ent.10k-8 385MB/s ± 0% 385MB/s ± 1% ~ (p=0.510 n=9+10) Decompress1XTable/superlow-ent-10k-8 376MB/s ± 0% 376MB/s ± 0% ~ (p=0.712 n=8+10) Decompress1XTable/crash2-8 17.3MB/s ± 1% 17.3MB/s ± 1% ~ (p=0.926 n=10+10) Decompress1XTable/endzerobits-8 52.9MB/s ± 1% 52.4MB/s ± 0% -0.94% (p=0.000 n=10+10) Decompress1XTable/endnonzero-8 11.4MB/s ± 0% 11.4MB/s ± 1% ~ (p=0.343 n=10+10) Decompress1XTable/case1-8 22.0MB/s ± 0% 22.0MB/s ± 0% ~ (p=0.618 n=9+9) Decompress1XTable/case2-8 18.1MB/s ± 0% 18.1MB/s ± 0% ~ (p=0.348 n=9+9) Decompress1XTable/case3-8 19.1MB/s ± 0% 19.1MB/s ± 0% +0.21% (p=0.048 n=10+10) Decompress1XTable/pngdata.001-8 374MB/s ± 0% 374MB/s ± 0% ~ (p=0.861 n=9+10) Decompress1XTable/normcount2-8 54.3MB/s ± 1% 54.5MB/s ± 1% ~ (p=0.093 n=10+10) Decompress1XNoTable/digits/100-8 279MB/s ± 0% 280MB/s ± 0% +0.30% (p=0.003 n=10+9) Decompress1XNoTable/digits/10000-8 366MB/s ± 0% 365MB/s ± 0% ~ (p=0.113 n=10+9) Decompress1XNoTable/digits/262143-8 347MB/s ± 0% 347MB/s ± 1% ~ (p=0.739 n=10+10) Decompress1XNoTable/gettysburg/100-8 278MB/s ± 1% 277MB/s ± 1% ~ (p=0.676 n=10+9) Decompress1XNoTable/gettysburg/10000-8 363MB/s ± 1% 362MB/s ± 0% -0.50% (p=0.001 n=10+9) Decompress1XNoTable/gettysburg/262143-8 350MB/s ± 0% 347MB/s ± 0% -0.90% (p=0.000 n=10+8) Decompress1XNoTable/twain/100-8 268MB/s ± 0% 267MB/s ± 0% ~ (p=0.384 n=9+8) Decompress1XNoTable/twain/10000-8 363MB/s ± 0% 362MB/s ± 0% -0.32% (p=0.000 n=9+9) Decompress1XNoTable/twain/262143-8 328MB/s ± 0% 329MB/s ± 0% ~ (p=0.063 n=9+10) Decompress1XNoTable/low-ent.10k/100-8 180MB/s ± 0% 181MB/s ± 0% ~ (p=0.225 n=10+10) Decompress1XNoTable/low-ent.10k/10000-8 385MB/s ± 0% 385MB/s ± 0% ~ (p=0.289 n=10+10) Decompress1XNoTable/low-ent.10k/262143-8 389MB/s ± 1% 389MB/s ± 1% ~ (p=0.971 n=10+10) Decompress1XNoTable/superlow-ent-10k/262143-8 389MB/s ± 0% 390MB/s ± 0% +0.27% (p=0.017 n=9+10) Decompress1XNoTable/crash2/100-8 278MB/s ± 0% 279MB/s ± 1% ~ (p=0.163 n=9+10) Decompress1XNoTable/crash2/10000-8 373MB/s ± 1% 373MB/s ± 0% ~ (p=0.370 n=10+8) Decompress1XNoTable/crash2/262143-8 375MB/s ± 0% 375MB/s ± 0% ~ (p=0.604 n=9+10) Decompress1XNoTable/endzerobits/100-8 180MB/s ± 0% 181MB/s ± 0% +0.26% (p=0.005 n=10+9) Decompress1XNoTable/endzerobits/10000-8 384MB/s ± 0% 385MB/s ± 0% ~ (p=0.914 n=8+10) Decompress1XNoTable/endzerobits/262143-8 389MB/s ± 0% 390MB/s ± 0% ~ (p=0.739 n=10+10) Decompress1XNoTable/endnonzero/100-8 180MB/s ± 1% 180MB/s ± 1% ~ (p=0.926 n=10+10) Decompress1XNoTable/endnonzero/10000-8 384MB/s ± 0% 384MB/s ± 0% ~ (p=0.965 n=10+8) Decompress1XNoTable/endnonzero/262143-8 390MB/s ± 0% 390MB/s ± 0% ~ (p=0.633 n=8+10) Decompress1XNoTable/case1/100-8 282MB/s ± 0% 283MB/s ± 0% +0.34% (p=0.005 n=10+10) Decompress1XNoTable/case1/10000-8 372MB/s ± 0% 373MB/s ± 0% ~ (p=0.113 n=9+9) Decompress1XNoTable/case1/262143-8 374MB/s ± 0% 374MB/s ± 0% ~ (p=0.448 n=10+10) Decompress1XNoTable/case2/100-8 274MB/s ± 1% 274MB/s ± 0% ~ (p=0.927 n=10+10) Decompress1XNoTable/case2/10000-8 376MB/s ± 0% 376MB/s ± 0% ~ (p=0.408 n=10+8) Decompress1XNoTable/case2/262143-8 376MB/s ± 1% 377MB/s ± 0% ~ (p=1.000 n=10+10) Decompress1XNoTable/case3/100-8 266MB/s ± 0% 265MB/s ± 0% ~ (p=0.113 n=9+10) Decompress1XNoTable/case3/10000-8 372MB/s ± 0% 372MB/s ± 0% ~ (p=0.075 n=10+9) Decompress1XNoTable/case3/262143-8 374MB/s ± 0% 374MB/s ± 0% ~ (p=0.172 n=10+10) Decompress1XNoTable/pngdata.001/100-8 238MB/s ± 0% 238MB/s ± 0% ~ (p=0.438 n=9+8) Decompress1XNoTable/pngdata.001/10000-8 384MB/s ± 0% 384MB/s ± 0% ~ (p=0.448 n=10+10) Decompress1XNoTable/pngdata.001/262143-8 378MB/s ± 0% 378MB/s ± 0% ~ (p=0.836 n=10+10) Decompress1XNoTable/normcount2/100-8 281MB/s ± 0% 282MB/s ± 1% ~ (p=0.122 n=8+10) Decompress1XNoTable/normcount2/10000-8 369MB/s ± 1% 369MB/s ± 0% ~ (p=0.912 n=10+10) Decompress1XNoTable/normcount2/262143-8 370MB/s ± 0% 370MB/s ± 1% ~ (p=0.342 n=10+10) Decompress4XNoTable/digits/100-8 197MB/s ± 0% 197MB/s ± 1% ~ (p=0.764 n=10+9) Decompress4XNoTable/digits/10000-8 594MB/s ± 0% 602MB/s ± 1% +1.35% (p=0.000 n=10+10) Decompress4XNoTable/digits/262143-8 570MB/s ± 1% 578MB/s ± 0% +1.30% (p=0.000 n=10+8) Decompress4XNoTable/gettysburg/100-8 258MB/s ± 1% 260MB/s ± 0% +0.59% (p=0.001 n=10+10) Decompress4XNoTable/gettysburg/10000-8 638MB/s ± 0% 641MB/s ± 0% +0.44% (p=0.000 n=9+9) Decompress4XNoTable/gettysburg/262143-8 573MB/s ± 1% 574MB/s ± 0% ~ (p=0.353 n=10+10) Decompress4XNoTable/twain/100-8 214MB/s ± 2% 214MB/s ± 2% ~ (p=0.853 n=10+10) Decompress4XNoTable/twain/10000-8 634MB/s ± 1% 638MB/s ± 0% +0.62% (p=0.000 n=10+10) Decompress4XNoTable/twain/262143-8 513MB/s ± 1% 517MB/s ± 0% +0.85% (p=0.000 n=10+10) Decompress4XNoTable/low-ent.10k/100-8 195MB/s ± 0% 194MB/s ± 0% ~ (p=0.130 n=9+9) Decompress4XNoTable/low-ent.10k/10000-8 635MB/s ± 0% 642MB/s ± 0% +1.19% (p=0.000 n=10+10) Decompress4XNoTable/low-ent.10k/262143-8 675MB/s ± 0% 685MB/s ± 0% +1.51% (p=0.000 n=10+10) Decompress4XNoTable/superlow-ent-10k/262143-8 673MB/s ± 1% 684MB/s ± 0% +1.70% (p=0.000 n=10+10) Decompress4XNoTable/case1/100-8 206MB/s ± 1% 206MB/s ± 0% ~ (p=0.189 n=10+9) Decompress4XNoTable/case1/10000-8 593MB/s ± 0% 601MB/s ± 0% +1.47% (p=0.000 n=10+10) Decompress4XNoTable/case1/262143-8 603MB/s ± 0% 613MB/s ± 0% +1.64% (p=0.000 n=10+10) Decompress4XNoTable/case2/100-8 201MB/s ± 0% 202MB/s ± 1% ~ (p=0.053 n=9+10) Decompress4XNoTable/case2/10000-8 610MB/s ± 0% 618MB/s ± 0% +1.30% (p=0.000 n=9+10) Decompress4XNoTable/case2/262143-8 622MB/s ± 1% 634MB/s ± 0% +1.90% (p=0.000 n=9+8) Decompress4XNoTable/case3/100-8 197MB/s ± 1% 198MB/s ± 0% +0.53% (p=0.001 n=9+10) Decompress4XNoTable/case3/10000-8 606MB/s ± 0% 615MB/s ± 0% +1.49% (p=0.000 n=8+10) Decompress4XNoTable/case3/262143-8 613MB/s ± 1% 622MB/s ± 0% +1.48% (p=0.000 n=10+10) Decompress4XNoTable/pngdata.001/100-8 212MB/s ± 1% 211MB/s ± 0% ~ (p=0.136 n=9+9) Decompress4XNoTable/pngdata.001/10000-8 645MB/s ± 1% 649MB/s ± 1% +0.65% (p=0.000 n=9+10) Decompress4XNoTable/pngdata.001/262143-8 640MB/s ± 1% 649MB/s ± 0% +1.44% (p=0.000 n=10+10) Decompress4XNoTable/normcount2/100-8 260MB/s ± 1% 261MB/s ± 1% ~ (p=0.211 n=10+9) Decompress4XNoTable/normcount2/10000-8 584MB/s ± 1% 591MB/s ± 0% +1.33% (p=0.000 n=9+9) Decompress4XNoTable/normcount2/262143-8 588MB/s ± 1% 596MB/s ± 1% +1.39% (p=0.000 n=10+9) Decompress4XNoTableTableLog8/digits-8 583MB/s ± 1% 592MB/s ± 0% +1.48% (p=0.000 n=10+10) Decompress4XTable/digits-8 580MB/s ± 0% 588MB/s ± 0% +1.33% (p=0.000 n=8+10) Decompress4XTable/gettysburg-8 368MB/s ± 1% 370MB/s ± 0% +0.59% (p=0.017 n=10+9) Decompress4XTable/twain-8 510MB/s ± 0% 515MB/s ± 0% +0.99% (p=0.000 n=9+10) Decompress4XTable/low-ent.10k-8 657MB/s ± 0% 665MB/s ± 0% +1.24% (p=0.000 n=10+10) Decompress4XTable/superlow-ent-10k-8 608MB/s ± 0% 617MB/s ± 1% +1.48% (p=0.000 n=8+10) Decompress4XTable/case1-8 21.1MB/s ± 1% 21.0MB/s ± 2% ~ (p=0.223 n=10+10) Decompress4XTable/case2-8 17.6MB/s ± 0% 17.6MB/s ± 0% ~ (p=0.199 n=9+10) Decompress4XTable/case3-8 18.7MB/s ± 0% 18.7MB/s ± 0% ~ (p=0.557 n=10+8) Decompress4XTable/pngdata.001-8 633MB/s ± 1% 645MB/s ± 0% +1.90% (p=0.000 n=9+10) Decompress4XTable/normcount2-8 49.9MB/s ± 1% 49.5MB/s ± 1% -0.64% (p=0.002 n=10+10) [Geo mean] 270MB/s 271MB/s +0.36% --- huff0/_generate/gen.go | 108 +++--- huff0/decompress_amd64.go | 10 +- huff0/decompress_amd64.s | 686 +++++++++++++++++++------------------- 3 files changed, 382 insertions(+), 422 deletions(-) diff --git a/huff0/_generate/gen.go b/huff0/_generate/gen.go index 485eed6472..6eab53ff4f 100644 --- a/huff0/_generate/gen.go +++ b/huff0/_generate/gen.go @@ -49,7 +49,7 @@ func (d decompress4x) generateProcedure(name string) { exhausted := GP64() XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false - limitPtr := AllocLocal(8) + limit := GP64() bufferOrigin := GP64() peekBits := GP64() @@ -57,25 +57,17 @@ func (d decompress4x) generateProcedure(name string) { dstEvery := GP64() table := GP64() - br0 := GP64() - br1 := GP64() - br2 := GP64() - br3 := GP64() + br := GP64() Comment("Preload values") { ctx := Dereference(Param("ctx")) Load(ctx.Field("peekBits"), peekBits) - Load(ctx.Field("out"), buffer) - MOVQ(buffer, bufferOrigin) - limit := Load(ctx.Field("limit"), GP64()) - MOVQ(limit, limitPtr) + Load(ctx.Field("out"), bufferOrigin) + Load(ctx.Field("limit"), limit) Load(ctx.Field("dstEvery"), dstEvery) Load(ctx.Field("tbl"), table) - Load(ctx.Field("pbr0"), br0) - Load(ctx.Field("pbr1"), br1) - Load(ctx.Field("pbr2"), br2) - Load(ctx.Field("pbr3"), br3) + Load(ctx.Field("pbr"), br) } Comment("Main loop") @@ -83,15 +75,15 @@ func (d decompress4x) generateProcedure(name string) { MOVQ(bufferOrigin, buffer) // Check if we have space - CMPQ(buffer, limitPtr) + CMPQ(buffer, limit) SETGE(exhausted.As8()) - d.decodeTwoValues(0, br0, peekBits, table, buffer, exhausted) + d.decodeTwoValues(0, br, peekBits, table, buffer, exhausted) ADDQ(dstEvery, buffer) - d.decodeTwoValues(1, br1, peekBits, table, buffer, exhausted) + d.decodeTwoValues(1, br, peekBits, table, buffer, exhausted) ADDQ(dstEvery, buffer) - d.decodeTwoValues(2, br2, peekBits, table, buffer, exhausted) + d.decodeTwoValues(2, br, peekBits, table, buffer, exhausted) ADDQ(dstEvery, buffer) - d.decodeTwoValues(3, br3, peekBits, table, buffer, exhausted) + d.decodeTwoValues(3, br, peekBits, table, buffer, exhausted) ADDQ(U8(2), bufferOrigin) // off += 2 @@ -100,10 +92,9 @@ func (d decompress4x) generateProcedure(name string) { { ctx := Dereference(Param("ctx")) - tmp := Load(ctx.Field("out"), GP64()) - decoded := GP64() - MOVQ(bufferOrigin, decoded) - SUBQ(tmp, decoded) + ctxout, _ := ctx.Field("out").Resolve() + decoded := bufferOrigin + SUBQ(ctxout.Addr, decoded) SHLQ(U8(2), decoded) // decoded *= 4 Store(decoded, ctx.Field("decoded")) @@ -118,6 +109,7 @@ const bitReader_in = 0 const bitReader_off = bitReader_in + 3*8 // {ptr, len, cap} const bitReader_value = bitReader_off + 8 const bitReader_bitsRead = bitReader_value + 8 +const bitReader__size = bitReader_bitsRead + 8 func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) { brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted) @@ -157,9 +149,10 @@ func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhau Comment("out[id * dstEvery + 1] = uint8(v1.entry >> 8)") MOVW(out.As16(), Mem{Base: buffer}) - Comment("update the bitrader reader structure") - MOVQ(brValue, Mem{Base: br, Disp: bitReader_value}) - MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead}) + Comment("update the bitreader structure") + offset := id * bitReader__size + MOVQ(brValue, Mem{Base: br, Disp: offset + bitReader_value}) + MOVB(brBitsRead.As8(), Mem{Base: br, Disp: offset + bitReader_bitsRead}) } func (d decompress4x) generateProcedure4x8bit(name string) { @@ -171,33 +164,25 @@ func (d decompress4x) generateProcedure4x8bit(name string) { exhausted := GP64() // Fixed since we need 8H XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false - bufferOrigin := AllocLocal(8) - limitPtr := AllocLocal(8) + bufferOrigin := GP64() + limit := GP64() peekBits := GP64() buffer := GP64() dstEvery := GP64() table := GP64() - br0 := GP64() - br1 := GP64() - br2 := GP64() - br3 := GP64() + br := GP64() Comment("Preload values") { ctx := Dereference(Param("ctx")) Load(ctx.Field("peekBits"), peekBits) - Load(ctx.Field("out"), buffer) - MOVQ(buffer, bufferOrigin) - limit := Load(ctx.Field("limit"), GP64()) - MOVQ(limit, limitPtr) + Load(ctx.Field("out"), bufferOrigin) + Load(ctx.Field("limit"), limit) Load(ctx.Field("dstEvery"), dstEvery) Load(ctx.Field("tbl"), table) - Load(ctx.Field("pbr0"), br0) - Load(ctx.Field("pbr1"), br1) - Load(ctx.Field("pbr2"), br2) - Load(ctx.Field("pbr3"), br3) + Load(ctx.Field("pbr"), br) } Comment("Main loop") @@ -205,15 +190,15 @@ func (d decompress4x) generateProcedure4x8bit(name string) { MOVQ(bufferOrigin, buffer) // Check if we have space - CMPQ(buffer, limitPtr) + CMPQ(buffer, limit) SETGE(exhausted.As8()) - d.decodeFourValues(0, br0, peekBits, table, buffer, exhausted) + d.decodeFourValues(0, br, peekBits, table, buffer, exhausted) ADDQ(dstEvery, buffer) - d.decodeFourValues(1, br1, peekBits, table, buffer, exhausted) + d.decodeFourValues(1, br, peekBits, table, buffer, exhausted) ADDQ(dstEvery, buffer) - d.decodeFourValues(2, br2, peekBits, table, buffer, exhausted) + d.decodeFourValues(2, br, peekBits, table, buffer, exhausted) ADDQ(dstEvery, buffer) - d.decodeFourValues(3, br3, peekBits, table, buffer, exhausted) + d.decodeFourValues(3, br, peekBits, table, buffer, exhausted) ADDQ(U8(4), bufferOrigin) // off += 4 @@ -222,10 +207,9 @@ func (d decompress4x) generateProcedure4x8bit(name string) { { ctx := Dereference(Param("ctx")) - tmp := Load(ctx.Field("out"), GP64()) - decoded := GP64() - MOVQ(bufferOrigin, decoded) - SUBQ(tmp, decoded) + ctxout, _ := ctx.Field("out").Resolve() + decoded := bufferOrigin + SUBQ(ctxout.Addr, decoded) SHLQ(U8(2), decoded) // decoded *= 4 Store(decoded, ctx.Field("decoded")) @@ -234,7 +218,7 @@ func (d decompress4x) generateProcedure4x8bit(name string) { } func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) { - brValue, brBitsRead := d.fillFast32(id+1000, 32, br, exhausted) + brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted) decompress := func(valID int, outByte reg.Register) { CX := reg.CL @@ -269,9 +253,10 @@ func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exha Comment("out[id * dstEvery + 4] = uint8(v3.entry >> 8)") MOVL(out.As32(), Mem{Base: buffer}) - Comment("update the bitreader reader structure") - MOVQ(brValue, Mem{Base: br, Disp: bitReader_value}) - MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead}) + Comment("update the bitreader structure") + offset := id * bitReader__size + MOVQ(brValue, Mem{Base: br, Disp: offset + bitReader_value}) + MOVB(brBitsRead.As8(), Mem{Base: br, Disp: offset + bitReader_bitsRead}) } func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (brValue, brBitsRead reg.GPVirtual) { @@ -281,14 +266,15 @@ func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) ( Commentf("br%d.fillFast32()", id) brValue = GP64() brBitsRead = GP64() - MOVQ(Mem{Base: br, Disp: bitReader_value}, brValue) - MOVBQZX(Mem{Base: br, Disp: bitReader_bitsRead}, brBitsRead) + offset := bitReader__size * id + MOVQ(Mem{Base: br, Disp: offset + bitReader_value}, brValue) + MOVBQZX(Mem{Base: br, Disp: offset + bitReader_bitsRead}, brBitsRead) // We must have at least 2 * max tablelog left CMPQ(brBitsRead, U8(64-atLeast)) JBE(LabelRef("skip_fill" + strconv.Itoa(id))) brOffset := GP64() - MOVQ(Mem{Base: br, Disp: bitReader_off}, brOffset) + MOVQ(Mem{Base: br, Disp: offset + bitReader_off}, brOffset) SUBQ(U8(32), brBitsRead) // b.bitsRead -= 32 SUBQ(U8(4), brOffset) // b.off -= 4 @@ -297,7 +283,7 @@ func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) ( // v = v[:4] // low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24) tmp := GP64() - MOVQ(Mem{Base: br, Disp: bitReader_in}, tmp) + MOVQ(Mem{Base: br, Disp: offset + bitReader_in}, tmp) Comment("b.value |= uint64(low) << (b.bitsRead & 63)") addr := Mem{Base: brOffset, Index: tmp.As64(), Scale: 1} @@ -306,7 +292,7 @@ func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) ( MOVQ(brBitsRead, CX.As64()) SHLQ(CX, tmp.As64()) - MOVQ(brOffset, Mem{Base: br, Disp: bitReader_off}) + MOVQ(brOffset, Mem{Base: br, Disp: offset + bitReader_off}) ORQ(tmp.As64(), brValue) { Commentf("exhausted = exhausted || (br%d.off < 4)", id) @@ -474,11 +460,9 @@ func (d decompress1x) generateProcedure(name string) { { // calculate decoded as current `out` - initial `out` ctx := Dereference(Param("ctx")) - decoded := GP64() - tmp := GP64() - MOVQ(buffer, decoded) - Load(ctx.Field("out"), tmp) - SUBQ(tmp, decoded) + ctxout, _ := ctx.Field("out").Resolve() + decoded := buffer + SUBQ(ctxout.Addr, decoded) Store(decoded, ctx.Field("decoded")) pbr := Dereference(ctx.Field("pbr")) diff --git a/huff0/decompress_amd64.go b/huff0/decompress_amd64.go index 671e630a84..9f3e9f79e2 100644 --- a/huff0/decompress_amd64.go +++ b/huff0/decompress_amd64.go @@ -27,10 +27,7 @@ func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) const fallback8BitSize = 800 type decompress4xContext struct { - pbr0 *bitReaderShifted - pbr1 *bitReaderShifted - pbr2 *bitReaderShifted - pbr3 *bitReaderShifted + pbr *[4]bitReaderShifted peekBits uint8 out *byte dstEvery int @@ -89,10 +86,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) { ctx := decompress4xContext{ - pbr0: &br[0], - pbr1: &br[1], - pbr2: &br[2], - pbr3: &br[3], + pbr: &br, peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast() out: &out[0], dstEvery: dstEvery, diff --git a/huff0/decompress_amd64.s b/huff0/decompress_amd64.s index 6c65c6e2b2..dd1a5aecd6 100644 --- a/huff0/decompress_amd64.s +++ b/huff0/decompress_amd64.s @@ -4,45 +4,40 @@ // +build amd64,!appengine,!noasm,gc // func decompress4x_main_loop_amd64(ctx *decompress4xContext) -TEXT ·decompress4x_main_loop_amd64(SB), $8-8 +TEXT ·decompress4x_main_loop_amd64(SB), $0-8 XORQ DX, DX // Preload values MOVQ ctx+0(FP), AX - MOVBQZX 32(AX), SI - MOVQ 40(AX), DI - MOVQ DI, BX - MOVQ 72(AX), CX - MOVQ CX, (SP) - MOVQ 48(AX), R8 - MOVQ 56(AX), R9 - MOVQ (AX), R10 - MOVQ 8(AX), R11 - MOVQ 16(AX), R12 - MOVQ 24(AX), R13 + MOVBQZX 8(AX), DI + MOVQ 16(AX), SI + MOVQ 48(AX), BX + MOVQ 24(AX), R9 + MOVQ 32(AX), R10 + MOVQ (AX), R11 // Main loop main_loop: - MOVQ BX, DI - CMPQ DI, (SP) + MOVQ SI, R8 + CMPQ R8, BX SETGE DL // br0.fillFast32() - MOVQ 32(R10), R14 - MOVBQZX 40(R10), R15 - CMPQ R15, $0x20 + MOVQ 32(R11), R12 + MOVBQZX 40(R11), R13 + CMPQ R13, $0x20 JBE skip_fill0 - MOVQ 24(R10), AX - SUBQ $0x20, R15 + MOVQ 24(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R10), BP + MOVQ (R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R10) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 24(R11) + ORQ R14, R12 // exhausted = exhausted || (br0.off < 4) CMPQ AX, $0x04 @@ -51,57 +46,57 @@ main_loop: skip_fill0: // val0 := br0.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br0.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R10) - MOVB R15, 40(R10) - ADDQ R8, DI + // update the bitreader structure + MOVQ R12, 32(R11) + MOVB R13, 40(R11) + ADDQ R9, R8 // br1.fillFast32() - MOVQ 32(R11), R14 - MOVBQZX 40(R11), R15 - CMPQ R15, $0x20 + MOVQ 80(R11), R12 + MOVBQZX 88(R11), R13 + CMPQ R13, $0x20 JBE skip_fill1 - MOVQ 24(R11), AX - SUBQ $0x20, R15 + MOVQ 72(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R11), BP + MOVQ 48(R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R11) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 72(R11) + ORQ R14, R12 // exhausted = exhausted || (br1.off < 4) CMPQ AX, $0x04 @@ -110,57 +105,57 @@ skip_fill0: skip_fill1: // val0 := br1.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br1.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R11) - MOVB R15, 40(R11) - ADDQ R8, DI + // update the bitreader structure + MOVQ R12, 80(R11) + MOVB R13, 88(R11) + ADDQ R9, R8 // br2.fillFast32() - MOVQ 32(R12), R14 - MOVBQZX 40(R12), R15 - CMPQ R15, $0x20 + MOVQ 128(R11), R12 + MOVBQZX 136(R11), R13 + CMPQ R13, $0x20 JBE skip_fill2 - MOVQ 24(R12), AX - SUBQ $0x20, R15 + MOVQ 120(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R12), BP + MOVQ 96(R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R12) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 120(R11) + ORQ R14, R12 // exhausted = exhausted || (br2.off < 4) CMPQ AX, $0x04 @@ -169,57 +164,57 @@ skip_fill1: skip_fill2: // val0 := br2.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br2.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R12) - MOVB R15, 40(R12) - ADDQ R8, DI + // update the bitreader structure + MOVQ R12, 128(R11) + MOVB R13, 136(R11) + ADDQ R9, R8 // br3.fillFast32() - MOVQ 32(R13), R14 - MOVBQZX 40(R13), R15 - CMPQ R15, $0x20 + MOVQ 176(R11), R12 + MOVBQZX 184(R11), R13 + CMPQ R13, $0x20 JBE skip_fill3 - MOVQ 24(R13), AX - SUBQ $0x20, R15 + MOVQ 168(R11), AX + SUBQ $0x20, R13 SUBQ $0x04, AX - MOVQ (R13), BP + MOVQ 144(R11), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(BP*1), BP - MOVQ R15, CX - SHLQ CL, BP - MOVQ AX, 24(R13) - ORQ BP, R14 + MOVL (AX)(R14*1), R14 + MOVQ R13, CX + SHLQ CL, R14 + MOVQ AX, 168(R11) + ORQ R14, R12 // exhausted = exhausted || (br3.off < 4) CMPQ AX, $0x04 @@ -228,149 +223,142 @@ skip_fill2: skip_fill3: // val0 := br3.peekTopBits(peekBits) - MOVQ R14, BP - MOVQ SI, CX - SHRQ CL, BP + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br3.peekTopBits(peekBits) - MOVQ SI, CX - MOVQ R14, BP - SHRQ CL, BP + MOVQ DI, CX + MOVQ R12, R14 + SHRQ CL, R14 // v1 := table[val1&mask] - MOVW (R9)(BP*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R14 - ADDB CL, R15 + SHLQ CL, R12 + ADDB CL, R13 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (DI) + MOVW AX, (R8) - // update the bitrader reader structure - MOVQ R14, 32(R13) - MOVB R15, 40(R13) - ADDQ $0x02, BX + // update the bitreader structure + MOVQ R12, 176(R11) + MOVB R13, 184(R11) + ADDQ $0x02, SI TESTB DL, DL JZ main_loop MOVQ ctx+0(FP), AX - MOVQ 40(AX), CX - MOVQ BX, DX - SUBQ CX, DX - SHLQ $0x02, DX - MOVQ DX, 64(AX) + SUBQ 16(AX), SI + SHLQ $0x02, SI + MOVQ SI, 40(AX) RET // func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) -TEXT ·decompress4x_8b_main_loop_amd64(SB), $16-8 +TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8 XORQ DX, DX // Preload values MOVQ ctx+0(FP), CX - MOVBQZX 32(CX), BX - MOVQ 40(CX), SI - MOVQ SI, (SP) - MOVQ 72(CX), DX - MOVQ DX, 8(SP) - MOVQ 48(CX), DI - MOVQ 56(CX), R8 - MOVQ (CX), R9 - MOVQ 8(CX), R10 - MOVQ 16(CX), R11 - MOVQ 24(CX), R12 + MOVBQZX 8(CX), DI + MOVQ 16(CX), BX + MOVQ 48(CX), SI + MOVQ 24(CX), R9 + MOVQ 32(CX), R10 + MOVQ (CX), R11 // Main loop main_loop: - MOVQ (SP), SI - CMPQ SI, 8(SP) + MOVQ BX, R8 + CMPQ R8, SI SETGE DL - // br1000.fillFast32() - MOVQ 32(R9), R13 - MOVBQZX 40(R9), R14 - CMPQ R14, $0x20 - JBE skip_fill1000 - MOVQ 24(R9), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R9), BP + // br0.fillFast32() + MOVQ 32(R11), R12 + MOVBQZX 40(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill0 + MOVQ 24(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ (R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R9) - ORQ BP, R13 - - // exhausted = exhausted || (br1000.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 24(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br0.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1000: +skip_fill0: // val0 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br0.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br0.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -378,88 +366,88 @@ skip_fill1000: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) - - // update the bitreader reader structure - MOVQ R13, 32(R9) - MOVB R14, 40(R9) - ADDQ DI, SI - - // br1001.fillFast32() - MOVQ 32(R10), R13 - MOVBQZX 40(R10), R14 - CMPQ R14, $0x20 - JBE skip_fill1001 - MOVQ 24(R10), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R10), BP + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 32(R11) + MOVB R13, 40(R11) + ADDQ R9, R8 + + // br1.fillFast32() + MOVQ 80(R11), R12 + MOVBQZX 88(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill1 + MOVQ 72(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ 48(R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R10) - ORQ BP, R13 - - // exhausted = exhausted || (br1001.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 72(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br1.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1001: +skip_fill1: // val0 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br1.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br1.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -467,88 +455,88 @@ skip_fill1001: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) - - // update the bitreader reader structure - MOVQ R13, 32(R10) - MOVB R14, 40(R10) - ADDQ DI, SI - - // br1002.fillFast32() - MOVQ 32(R11), R13 - MOVBQZX 40(R11), R14 - CMPQ R14, $0x20 - JBE skip_fill1002 - MOVQ 24(R11), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R11), BP + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 80(R11) + MOVB R13, 88(R11) + ADDQ R9, R8 + + // br2.fillFast32() + MOVQ 128(R11), R12 + MOVBQZX 136(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill2 + MOVQ 120(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ 96(R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R11) - ORQ BP, R13 - - // exhausted = exhausted || (br1002.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 120(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br2.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1002: +skip_fill2: // val0 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br2.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br2.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -556,88 +544,88 @@ skip_fill1002: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) - - // update the bitreader reader structure - MOVQ R13, 32(R11) - MOVB R14, 40(R11) - ADDQ DI, SI - - // br1003.fillFast32() - MOVQ 32(R12), R13 - MOVBQZX 40(R12), R14 - CMPQ R14, $0x20 - JBE skip_fill1003 - MOVQ 24(R12), R15 - SUBQ $0x20, R14 - SUBQ $0x04, R15 - MOVQ (R12), BP + MOVL AX, (R8) + + // update the bitreader structure + MOVQ R12, 128(R11) + MOVB R13, 136(R11) + ADDQ R9, R8 + + // br3.fillFast32() + MOVQ 176(R11), R12 + MOVBQZX 184(R11), R13 + CMPQ R13, $0x20 + JBE skip_fill3 + MOVQ 168(R11), R14 + SUBQ $0x20, R13 + SUBQ $0x04, R14 + MOVQ 144(R11), R15 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R15)(BP*1), BP - MOVQ R14, CX - SHLQ CL, BP - MOVQ R15, 24(R12) - ORQ BP, R13 - - // exhausted = exhausted || (br1003.off < 4) - CMPQ R15, $0x04 + MOVL (R14)(R15*1), R15 + MOVQ R13, CX + SHLQ CL, R15 + MOVQ R14, 168(R11) + ORQ R15, R12 + + // exhausted = exhausted || (br3.off < 4) + CMPQ R14, $0x04 SETLT AL ORB AL, DL -skip_fill1003: +skip_fill3: // val0 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v0 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val1 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v1 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // val2 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v2 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 // val3 := br3.peekTopBits(peekBits) - MOVQ R13, R15 - MOVQ BX, CX - SHRQ CL, R15 + MOVQ R12, R14 + MOVQ DI, CX + SHRQ CL, R14 // v3 := table[val0&mask] - MOVW (R8)(R15*2), CX + MOVW (R10)(R14*2), CX // br3.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R13 - ADDB CL, R14 + SHLQ CL, R12 + ADDB CL, R13 BSWAPL AX // these four writes get coalesced @@ -645,20 +633,18 @@ skip_fill1003: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (SI) + MOVL AX, (R8) - // update the bitreader reader structure - MOVQ R13, 32(R12) - MOVB R14, 40(R12) - ADDQ $0x04, (SP) + // update the bitreader structure + MOVQ R12, 176(R11) + MOVB R13, 184(R11) + ADDQ $0x04, BX TESTB DL, DL JZ main_loop MOVQ ctx+0(FP), AX - MOVQ 40(AX), CX - MOVQ (SP), DX - SUBQ CX, DX - SHLQ $0x02, DX - MOVQ DX, 64(AX) + SUBQ 16(AX), BX + SHLQ $0x02, BX + MOVQ BX, 40(AX) RET // func decompress1x_main_loop_amd64(ctx *decompress1xContext) @@ -750,10 +736,8 @@ loop_condition: // Update ctx structure MOVQ ctx+0(FP), AX - MOVQ DX, CX - MOVQ 16(AX), DX - SUBQ DX, CX - MOVQ CX, 40(AX) + SUBQ 16(AX), DX + MOVQ DX, 40(AX) MOVQ (AX), AX MOVQ R9, 24(AX) MOVQ R10, 32(AX) @@ -847,10 +831,8 @@ loop_condition: // Update ctx structure MOVQ ctx+0(FP), AX - MOVQ DX, CX - MOVQ 16(AX), DX - SUBQ DX, CX - MOVQ CX, 40(AX) + SUBQ 16(AX), DX + MOVQ DX, 40(AX) MOVQ (AX), AX MOVQ R9, 24(AX) MOVQ R10, 32(AX)