Skip to content

Commit

Permalink
huff0: Pass a single bitReader pointer to asm (#634)
Browse files: browse the repository at this point in the history
This makes the context object smaller and frees up three registers,
which we can use to replace the limitPtr and bufferOrigin stack
variables.

Benchmark results show a tiny win (Go 1.19beta, Core i7-3770K):

	name                                           old speed      new speed      delta
	Decompress1XTable/digits-8                      347MB/s ± 0%   347MB/s ± 0%    ~     (p=0.650 n=8+10)
	Decompress1XTable/gettysburg-8                  268MB/s ± 0%   268MB/s ± 0%    ~     (p=0.400 n=9+9)
	Decompress1XTable/twain-8                       327MB/s ± 0%   327MB/s ± 1%    ~     (p=0.339 n=7+9)
	Decompress1XTable/low-ent.10k-8                 385MB/s ± 0%   385MB/s ± 1%    ~     (p=0.510 n=9+10)
	Decompress1XTable/superlow-ent-10k-8            376MB/s ± 0%   376MB/s ± 0%    ~     (p=0.712 n=8+10)
	Decompress1XTable/crash2-8                     17.3MB/s ± 1%  17.3MB/s ± 1%    ~     (p=0.926 n=10+10)
	Decompress1XTable/endzerobits-8                52.9MB/s ± 1%  52.4MB/s ± 0%  -0.94%  (p=0.000 n=10+10)
	Decompress1XTable/endnonzero-8                 11.4MB/s ± 0%  11.4MB/s ± 1%    ~     (p=0.343 n=10+10)
	Decompress1XTable/case1-8                      22.0MB/s ± 0%  22.0MB/s ± 0%    ~     (p=0.618 n=9+9)
	Decompress1XTable/case2-8                      18.1MB/s ± 0%  18.1MB/s ± 0%    ~     (p=0.348 n=9+9)
	Decompress1XTable/case3-8                      19.1MB/s ± 0%  19.1MB/s ± 0%  +0.21%  (p=0.048 n=10+10)
	Decompress1XTable/pngdata.001-8                 374MB/s ± 0%   374MB/s ± 0%    ~     (p=0.861 n=9+10)
	Decompress1XTable/normcount2-8                 54.3MB/s ± 1%  54.5MB/s ± 1%    ~     (p=0.093 n=10+10)
	Decompress1XNoTable/digits/100-8                279MB/s ± 0%   280MB/s ± 0%  +0.30%  (p=0.003 n=10+9)
	Decompress1XNoTable/digits/10000-8              366MB/s ± 0%   365MB/s ± 0%    ~     (p=0.113 n=10+9)
	Decompress1XNoTable/digits/262143-8             347MB/s ± 0%   347MB/s ± 1%    ~     (p=0.739 n=10+10)
	Decompress1XNoTable/gettysburg/100-8            278MB/s ± 1%   277MB/s ± 1%    ~     (p=0.676 n=10+9)
	Decompress1XNoTable/gettysburg/10000-8          363MB/s ± 1%   362MB/s ± 0%  -0.50%  (p=0.001 n=10+9)
	Decompress1XNoTable/gettysburg/262143-8         350MB/s ± 0%   347MB/s ± 0%  -0.90%  (p=0.000 n=10+8)
	Decompress1XNoTable/twain/100-8                 268MB/s ± 0%   267MB/s ± 0%    ~     (p=0.384 n=9+8)
	Decompress1XNoTable/twain/10000-8               363MB/s ± 0%   362MB/s ± 0%  -0.32%  (p=0.000 n=9+9)
	Decompress1XNoTable/twain/262143-8              328MB/s ± 0%   329MB/s ± 0%    ~     (p=0.063 n=9+10)
	Decompress1XNoTable/low-ent.10k/100-8           180MB/s ± 0%   181MB/s ± 0%    ~     (p=0.225 n=10+10)
	Decompress1XNoTable/low-ent.10k/10000-8         385MB/s ± 0%   385MB/s ± 0%    ~     (p=0.289 n=10+10)
	Decompress1XNoTable/low-ent.10k/262143-8        389MB/s ± 1%   389MB/s ± 1%    ~     (p=0.971 n=10+10)
	Decompress1XNoTable/superlow-ent-10k/262143-8   389MB/s ± 0%   390MB/s ± 0%  +0.27%  (p=0.017 n=9+10)
	Decompress1XNoTable/crash2/100-8                278MB/s ± 0%   279MB/s ± 1%    ~     (p=0.163 n=9+10)
	Decompress1XNoTable/crash2/10000-8              373MB/s ± 1%   373MB/s ± 0%    ~     (p=0.370 n=10+8)
	Decompress1XNoTable/crash2/262143-8             375MB/s ± 0%   375MB/s ± 0%    ~     (p=0.604 n=9+10)
	Decompress1XNoTable/endzerobits/100-8           180MB/s ± 0%   181MB/s ± 0%  +0.26%  (p=0.005 n=10+9)
	Decompress1XNoTable/endzerobits/10000-8         384MB/s ± 0%   385MB/s ± 0%    ~     (p=0.914 n=8+10)
	Decompress1XNoTable/endzerobits/262143-8        389MB/s ± 0%   390MB/s ± 0%    ~     (p=0.739 n=10+10)
	Decompress1XNoTable/endnonzero/100-8            180MB/s ± 1%   180MB/s ± 1%    ~     (p=0.926 n=10+10)
	Decompress1XNoTable/endnonzero/10000-8          384MB/s ± 0%   384MB/s ± 0%    ~     (p=0.965 n=10+8)
	Decompress1XNoTable/endnonzero/262143-8         390MB/s ± 0%   390MB/s ± 0%    ~     (p=0.633 n=8+10)
	Decompress1XNoTable/case1/100-8                 282MB/s ± 0%   283MB/s ± 0%  +0.34%  (p=0.005 n=10+10)
	Decompress1XNoTable/case1/10000-8               372MB/s ± 0%   373MB/s ± 0%    ~     (p=0.113 n=9+9)
	Decompress1XNoTable/case1/262143-8              374MB/s ± 0%   374MB/s ± 0%    ~     (p=0.448 n=10+10)
	Decompress1XNoTable/case2/100-8                 274MB/s ± 1%   274MB/s ± 0%    ~     (p=0.927 n=10+10)
	Decompress1XNoTable/case2/10000-8               376MB/s ± 0%   376MB/s ± 0%    ~     (p=0.408 n=10+8)
	Decompress1XNoTable/case2/262143-8              376MB/s ± 1%   377MB/s ± 0%    ~     (p=1.000 n=10+10)
	Decompress1XNoTable/case3/100-8                 266MB/s ± 0%   265MB/s ± 0%    ~     (p=0.113 n=9+10)
	Decompress1XNoTable/case3/10000-8               372MB/s ± 0%   372MB/s ± 0%    ~     (p=0.075 n=10+9)
	Decompress1XNoTable/case3/262143-8              374MB/s ± 0%   374MB/s ± 0%    ~     (p=0.172 n=10+10)
	Decompress1XNoTable/pngdata.001/100-8           238MB/s ± 0%   238MB/s ± 0%    ~     (p=0.438 n=9+8)
	Decompress1XNoTable/pngdata.001/10000-8         384MB/s ± 0%   384MB/s ± 0%    ~     (p=0.448 n=10+10)
	Decompress1XNoTable/pngdata.001/262143-8        378MB/s ± 0%   378MB/s ± 0%    ~     (p=0.836 n=10+10)
	Decompress1XNoTable/normcount2/100-8            281MB/s ± 0%   282MB/s ± 1%    ~     (p=0.122 n=8+10)
	Decompress1XNoTable/normcount2/10000-8          369MB/s ± 1%   369MB/s ± 0%    ~     (p=0.912 n=10+10)
	Decompress1XNoTable/normcount2/262143-8         370MB/s ± 0%   370MB/s ± 1%    ~     (p=0.342 n=10+10)
	Decompress4XNoTable/digits/100-8                197MB/s ± 0%   197MB/s ± 1%    ~     (p=0.764 n=10+9)
	Decompress4XNoTable/digits/10000-8              594MB/s ± 0%   602MB/s ± 1%  +1.35%  (p=0.000 n=10+10)
	Decompress4XNoTable/digits/262143-8             570MB/s ± 1%   578MB/s ± 0%  +1.30%  (p=0.000 n=10+8)
	Decompress4XNoTable/gettysburg/100-8            258MB/s ± 1%   260MB/s ± 0%  +0.59%  (p=0.001 n=10+10)
	Decompress4XNoTable/gettysburg/10000-8          638MB/s ± 0%   641MB/s ± 0%  +0.44%  (p=0.000 n=9+9)
	Decompress4XNoTable/gettysburg/262143-8         573MB/s ± 1%   574MB/s ± 0%    ~     (p=0.353 n=10+10)
	Decompress4XNoTable/twain/100-8                 214MB/s ± 2%   214MB/s ± 2%    ~     (p=0.853 n=10+10)
	Decompress4XNoTable/twain/10000-8               634MB/s ± 1%   638MB/s ± 0%  +0.62%  (p=0.000 n=10+10)
	Decompress4XNoTable/twain/262143-8              513MB/s ± 1%   517MB/s ± 0%  +0.85%  (p=0.000 n=10+10)
	Decompress4XNoTable/low-ent.10k/100-8           195MB/s ± 0%   194MB/s ± 0%    ~     (p=0.130 n=9+9)
	Decompress4XNoTable/low-ent.10k/10000-8         635MB/s ± 0%   642MB/s ± 0%  +1.19%  (p=0.000 n=10+10)
	Decompress4XNoTable/low-ent.10k/262143-8        675MB/s ± 0%   685MB/s ± 0%  +1.51%  (p=0.000 n=10+10)
	Decompress4XNoTable/superlow-ent-10k/262143-8   673MB/s ± 1%   684MB/s ± 0%  +1.70%  (p=0.000 n=10+10)
	Decompress4XNoTable/case1/100-8                 206MB/s ± 1%   206MB/s ± 0%    ~     (p=0.189 n=10+9)
	Decompress4XNoTable/case1/10000-8               593MB/s ± 0%   601MB/s ± 0%  +1.47%  (p=0.000 n=10+10)
	Decompress4XNoTable/case1/262143-8              603MB/s ± 0%   613MB/s ± 0%  +1.64%  (p=0.000 n=10+10)
	Decompress4XNoTable/case2/100-8                 201MB/s ± 0%   202MB/s ± 1%    ~     (p=0.053 n=9+10)
	Decompress4XNoTable/case2/10000-8               610MB/s ± 0%   618MB/s ± 0%  +1.30%  (p=0.000 n=9+10)
	Decompress4XNoTable/case2/262143-8              622MB/s ± 1%   634MB/s ± 0%  +1.90%  (p=0.000 n=9+8)
	Decompress4XNoTable/case3/100-8                 197MB/s ± 1%   198MB/s ± 0%  +0.53%  (p=0.001 n=9+10)
	Decompress4XNoTable/case3/10000-8               606MB/s ± 0%   615MB/s ± 0%  +1.49%  (p=0.000 n=8+10)
	Decompress4XNoTable/case3/262143-8              613MB/s ± 1%   622MB/s ± 0%  +1.48%  (p=0.000 n=10+10)
	Decompress4XNoTable/pngdata.001/100-8           212MB/s ± 1%   211MB/s ± 0%    ~     (p=0.136 n=9+9)
	Decompress4XNoTable/pngdata.001/10000-8         645MB/s ± 1%   649MB/s ± 1%  +0.65%  (p=0.000 n=9+10)
	Decompress4XNoTable/pngdata.001/262143-8        640MB/s ± 1%   649MB/s ± 0%  +1.44%  (p=0.000 n=10+10)
	Decompress4XNoTable/normcount2/100-8            260MB/s ± 1%   261MB/s ± 1%    ~     (p=0.211 n=10+9)
	Decompress4XNoTable/normcount2/10000-8          584MB/s ± 1%   591MB/s ± 0%  +1.33%  (p=0.000 n=9+9)
	Decompress4XNoTable/normcount2/262143-8         588MB/s ± 1%   596MB/s ± 1%  +1.39%  (p=0.000 n=10+9)
	Decompress4XNoTableTableLog8/digits-8           583MB/s ± 1%   592MB/s ± 0%  +1.48%  (p=0.000 n=10+10)
	Decompress4XTable/digits-8                      580MB/s ± 0%   588MB/s ± 0%  +1.33%  (p=0.000 n=8+10)
	Decompress4XTable/gettysburg-8                  368MB/s ± 1%   370MB/s ± 0%  +0.59%  (p=0.017 n=10+9)
	Decompress4XTable/twain-8                       510MB/s ± 0%   515MB/s ± 0%  +0.99%  (p=0.000 n=9+10)
	Decompress4XTable/low-ent.10k-8                 657MB/s ± 0%   665MB/s ± 0%  +1.24%  (p=0.000 n=10+10)
	Decompress4XTable/superlow-ent-10k-8            608MB/s ± 0%   617MB/s ± 1%  +1.48%  (p=0.000 n=8+10)
	Decompress4XTable/case1-8                      21.1MB/s ± 1%  21.0MB/s ± 2%    ~     (p=0.223 n=10+10)
	Decompress4XTable/case2-8                      17.6MB/s ± 0%  17.6MB/s ± 0%    ~     (p=0.199 n=9+10)
	Decompress4XTable/case3-8                      18.7MB/s ± 0%  18.7MB/s ± 0%    ~     (p=0.557 n=10+8)
	Decompress4XTable/pngdata.001-8                 633MB/s ± 1%   645MB/s ± 0%  +1.90%  (p=0.000 n=9+10)
	Decompress4XTable/normcount2-8                 49.9MB/s ± 1%  49.5MB/s ± 1%  -0.64%  (p=0.002 n=10+10)
	[Geo mean]                                      270MB/s        271MB/s       +0.36%
  • Loading branch information
greatroar authored Jul 8, 2022
1 parent b16a9af commit 4b3cc06
Show file tree
Hide file tree
Showing 3 changed files with 382 additions and 422 deletions.
108 changes: 46 additions & 62 deletions huff0/_generate/gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,49 +49,41 @@ func (d decompress4x) generateProcedure(name string) {
exhausted := GP64()
XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false

limitPtr := AllocLocal(8)
limit := GP64()

bufferOrigin := GP64()
peekBits := GP64()
buffer := GP64()
dstEvery := GP64()
table := GP64()

br0 := GP64()
br1 := GP64()
br2 := GP64()
br3 := GP64()
br := GP64()

Comment("Preload values")
{
ctx := Dereference(Param("ctx"))
Load(ctx.Field("peekBits"), peekBits)
Load(ctx.Field("out"), buffer)
MOVQ(buffer, bufferOrigin)
limit := Load(ctx.Field("limit"), GP64())
MOVQ(limit, limitPtr)
Load(ctx.Field("out"), bufferOrigin)
Load(ctx.Field("limit"), limit)
Load(ctx.Field("dstEvery"), dstEvery)
Load(ctx.Field("tbl"), table)
Load(ctx.Field("pbr0"), br0)
Load(ctx.Field("pbr1"), br1)
Load(ctx.Field("pbr2"), br2)
Load(ctx.Field("pbr3"), br3)
Load(ctx.Field("pbr"), br)
}

Comment("Main loop")
Label("main_loop")

MOVQ(bufferOrigin, buffer)
// Check if we have space
CMPQ(buffer, limitPtr)
CMPQ(buffer, limit)
SETGE(exhausted.As8())
d.decodeTwoValues(0, br0, peekBits, table, buffer, exhausted)
d.decodeTwoValues(0, br, peekBits, table, buffer, exhausted)
ADDQ(dstEvery, buffer)
d.decodeTwoValues(1, br1, peekBits, table, buffer, exhausted)
d.decodeTwoValues(1, br, peekBits, table, buffer, exhausted)
ADDQ(dstEvery, buffer)
d.decodeTwoValues(2, br2, peekBits, table, buffer, exhausted)
d.decodeTwoValues(2, br, peekBits, table, buffer, exhausted)
ADDQ(dstEvery, buffer)
d.decodeTwoValues(3, br3, peekBits, table, buffer, exhausted)
d.decodeTwoValues(3, br, peekBits, table, buffer, exhausted)

ADDQ(U8(2), bufferOrigin) // off += 2

Expand All @@ -100,10 +92,9 @@ func (d decompress4x) generateProcedure(name string) {

{
ctx := Dereference(Param("ctx"))
tmp := Load(ctx.Field("out"), GP64())
decoded := GP64()
MOVQ(bufferOrigin, decoded)
SUBQ(tmp, decoded)
ctxout, _ := ctx.Field("out").Resolve()
decoded := bufferOrigin
SUBQ(ctxout.Addr, decoded)
SHLQ(U8(2), decoded) // decoded *= 4

Store(decoded, ctx.Field("decoded"))
Expand All @@ -118,6 +109,7 @@ const bitReader_in = 0
// Byte offsets of the bitReader fields, each chained from the previous one
// starting at bitReader_in. The generators in this file use them as
// displacements from the br base register when emitting loads and stores.
const bitReader_off = bitReader_in + 3*8 // {ptr, len, cap}
const bitReader_value = bitReader_off + 8
const bitReader_bitsRead = bitReader_value + 8

// bitReader__size is the stride between consecutive readers: the generators
// compute id*bitReader__size to address reader number id through the single
// br pointer.
// NOTE(review): assumes the Go struct occupies exactly 8 more bytes after the
// bitsRead offset (field plus padding) — confirm against the bitReaderShifted
// definition.
const bitReader__size = bitReader_bitsRead + 8

func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) {
brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted)
Expand Down Expand Up @@ -157,9 +149,10 @@ func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhau
Comment("out[id * dstEvery + 1] = uint8(v1.entry >> 8)")
MOVW(out.As16(), Mem{Base: buffer})

Comment("update the bitrader reader structure")
MOVQ(brValue, Mem{Base: br, Disp: bitReader_value})
MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead})
Comment("update the bitreader structure")
offset := id * bitReader__size
MOVQ(brValue, Mem{Base: br, Disp: offset + bitReader_value})
MOVB(brBitsRead.As8(), Mem{Base: br, Disp: offset + bitReader_bitsRead})
}

func (d decompress4x) generateProcedure4x8bit(name string) {
Expand All @@ -171,49 +164,41 @@ func (d decompress4x) generateProcedure4x8bit(name string) {
exhausted := GP64() // Fixed since we need 8H
XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false

bufferOrigin := AllocLocal(8)
limitPtr := AllocLocal(8)
bufferOrigin := GP64()
limit := GP64()

peekBits := GP64()
buffer := GP64()
dstEvery := GP64()
table := GP64()

br0 := GP64()
br1 := GP64()
br2 := GP64()
br3 := GP64()
br := GP64()

Comment("Preload values")
{
ctx := Dereference(Param("ctx"))
Load(ctx.Field("peekBits"), peekBits)
Load(ctx.Field("out"), buffer)
MOVQ(buffer, bufferOrigin)
limit := Load(ctx.Field("limit"), GP64())
MOVQ(limit, limitPtr)
Load(ctx.Field("out"), bufferOrigin)
Load(ctx.Field("limit"), limit)
Load(ctx.Field("dstEvery"), dstEvery)
Load(ctx.Field("tbl"), table)
Load(ctx.Field("pbr0"), br0)
Load(ctx.Field("pbr1"), br1)
Load(ctx.Field("pbr2"), br2)
Load(ctx.Field("pbr3"), br3)
Load(ctx.Field("pbr"), br)
}

Comment("Main loop")
Label("main_loop")

MOVQ(bufferOrigin, buffer)
// Check if we have space
CMPQ(buffer, limitPtr)
CMPQ(buffer, limit)
SETGE(exhausted.As8())
d.decodeFourValues(0, br0, peekBits, table, buffer, exhausted)
d.decodeFourValues(0, br, peekBits, table, buffer, exhausted)
ADDQ(dstEvery, buffer)
d.decodeFourValues(1, br1, peekBits, table, buffer, exhausted)
d.decodeFourValues(1, br, peekBits, table, buffer, exhausted)
ADDQ(dstEvery, buffer)
d.decodeFourValues(2, br2, peekBits, table, buffer, exhausted)
d.decodeFourValues(2, br, peekBits, table, buffer, exhausted)
ADDQ(dstEvery, buffer)
d.decodeFourValues(3, br3, peekBits, table, buffer, exhausted)
d.decodeFourValues(3, br, peekBits, table, buffer, exhausted)

ADDQ(U8(4), bufferOrigin) // off += 4

Expand All @@ -222,10 +207,9 @@ func (d decompress4x) generateProcedure4x8bit(name string) {

{
ctx := Dereference(Param("ctx"))
tmp := Load(ctx.Field("out"), GP64())
decoded := GP64()
MOVQ(bufferOrigin, decoded)
SUBQ(tmp, decoded)
ctxout, _ := ctx.Field("out").Resolve()
decoded := bufferOrigin
SUBQ(ctxout.Addr, decoded)
SHLQ(U8(2), decoded) // decoded *= 4

Store(decoded, ctx.Field("decoded"))
Expand All @@ -234,7 +218,7 @@ func (d decompress4x) generateProcedure4x8bit(name string) {
}

func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) {
brValue, brBitsRead := d.fillFast32(id+1000, 32, br, exhausted)
brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted)

decompress := func(valID int, outByte reg.Register) {
CX := reg.CL
Expand Down Expand Up @@ -269,9 +253,10 @@ func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exha
Comment("out[id * dstEvery + 4] = uint8(v3.entry >> 8)")
MOVL(out.As32(), Mem{Base: buffer})

Comment("update the bitreader reader structure")
MOVQ(brValue, Mem{Base: br, Disp: bitReader_value})
MOVB(brBitsRead.As8(), Mem{Base: br, Disp: bitReader_bitsRead})
Comment("update the bitreader structure")
offset := id * bitReader__size
MOVQ(brValue, Mem{Base: br, Disp: offset + bitReader_value})
MOVB(brBitsRead.As8(), Mem{Base: br, Disp: offset + bitReader_bitsRead})
}

func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (brValue, brBitsRead reg.GPVirtual) {
Expand All @@ -281,14 +266,15 @@ func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (
Commentf("br%d.fillFast32()", id)
brValue = GP64()
brBitsRead = GP64()
MOVQ(Mem{Base: br, Disp: bitReader_value}, brValue)
MOVBQZX(Mem{Base: br, Disp: bitReader_bitsRead}, brBitsRead)
offset := bitReader__size * id
MOVQ(Mem{Base: br, Disp: offset + bitReader_value}, brValue)
MOVBQZX(Mem{Base: br, Disp: offset + bitReader_bitsRead}, brBitsRead)

// We must have at least 2 * max tablelog left
CMPQ(brBitsRead, U8(64-atLeast))
JBE(LabelRef("skip_fill" + strconv.Itoa(id)))
brOffset := GP64()
MOVQ(Mem{Base: br, Disp: bitReader_off}, brOffset)
MOVQ(Mem{Base: br, Disp: offset + bitReader_off}, brOffset)

SUBQ(U8(32), brBitsRead) // b.bitsRead -= 32
SUBQ(U8(4), brOffset) // b.off -= 4
Expand All @@ -297,7 +283,7 @@ func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (
// v = v[:4]
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
tmp := GP64()
MOVQ(Mem{Base: br, Disp: bitReader_in}, tmp)
MOVQ(Mem{Base: br, Disp: offset + bitReader_in}, tmp)

Comment("b.value |= uint64(low) << (b.bitsRead & 63)")
addr := Mem{Base: brOffset, Index: tmp.As64(), Scale: 1}
Expand All @@ -306,7 +292,7 @@ func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (
MOVQ(brBitsRead, CX.As64())
SHLQ(CX, tmp.As64())

MOVQ(brOffset, Mem{Base: br, Disp: bitReader_off})
MOVQ(brOffset, Mem{Base: br, Disp: offset + bitReader_off})
ORQ(tmp.As64(), brValue)
{
Commentf("exhausted = exhausted || (br%d.off < 4)", id)
Expand Down Expand Up @@ -474,11 +460,9 @@ func (d decompress1x) generateProcedure(name string) {
{
// calculate decoded as current `out` - initial `out`
ctx := Dereference(Param("ctx"))
decoded := GP64()
tmp := GP64()
MOVQ(buffer, decoded)
Load(ctx.Field("out"), tmp)
SUBQ(tmp, decoded)
ctxout, _ := ctx.Field("out").Resolve()
decoded := buffer
SUBQ(ctxout.Addr, decoded)
Store(decoded, ctx.Field("decoded"))

pbr := Dereference(ctx.Field("pbr"))
Expand Down
10 changes: 2 additions & 8 deletions huff0/decompress_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,7 @@ func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
const fallback8BitSize = 800

type decompress4xContext struct {
pbr0 *bitReaderShifted
pbr1 *bitReaderShifted
pbr2 *bitReaderShifted
pbr3 *bitReaderShifted
pbr *[4]bitReaderShifted
peekBits uint8
out *byte
dstEvery int
Expand Down Expand Up @@ -89,10 +86,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {

if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
ctx := decompress4xContext{
pbr0: &br[0],
pbr1: &br[1],
pbr2: &br[2],
pbr3: &br[3],
pbr: &br,
peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
out: &out[0],
dstEvery: dstEvery,
Expand Down
Loading

0 comments on commit 4b3cc06

Please sign in to comment.