Skip to content
This repository has been archived by the owner on Jun 20, 2023. It is now read-only.

buzhash: reduce target size and cutoff size #31

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 21 additions & 7 deletions buzhash.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,32 @@ import (
)

const (
buzMin = 128 << 10
buzMax = 512 << 10
buzMask = 1<<17 - 1
buzMinDefault = 16 << 10
buzMaxDefault = 64 << 10
buzMinLegacy = 128 << 10
buzMaxLegacy = 512 << 10
buzMask = 1<<17 - 1
Copy link

@dbaarda dbaarda May 10, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that buzMask determines the "target" size, which is the average distance past the min size at which a chunk boundary is found. The current setting gives tgt=2^17, or 128K, so the average chunk boundary will fall at 16K (min) + 128K (target) = 144K. Since this is larger than your max of 64K, most chunks will be truncated to 64K.

You also need to split the mask in two: keep buzMaskLegacy = 1<<17 - 1, and set buzMaskDefault = 1<<14 - 1, which gives tgt=16K and therefore a default average block size of 32K (16K min + 16K target). I'd also bump buzMaxDefault up to 128 << 10.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't that be (1<<14)-1?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ha! It turns out that in Go << does have higher precedence than -, so 1<<14 - 1 already parses as (1<<14) - 1 and the parentheses are optional. See ipfs/kubo#8952 (comment)

)

type Buzhash struct {
r io.Reader
buf []byte
n int

err error
err error
buzLegacy bool
}

func NewBuzhash(r io.Reader) *Buzhash {
func NewBuzhash(r io.Reader, buzLegacy bool) *Buzhash {
buzMax := buzMaxDefault
if buzLegacy {
buzMax = buzMaxLegacy
}

return &Buzhash{
r: r,
buf: pool.Get(buzMax),
r: r,
buf: pool.Get(buzMax),
buzLegacy: buzLegacy,
}
}

Expand All @@ -37,6 +46,11 @@ func (b *Buzhash) NextBytes() ([]byte, error) {
return nil, b.err
}

buzMin := buzMinDefault
if b.buzLegacy {
buzMin = buzMinLegacy
}

n, err := io.ReadFull(b.r, b.buf[b.n:])
if err != nil {
if err == io.ErrUnexpectedEOF || err == io.EOF {
Expand Down
8 changes: 5 additions & 3 deletions buzhash_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@ func testBuzhashChunking(t *testing.T, buf []byte) (chunkCount int) {
t.Fatal(err)
}

r := NewBuzhash(bytes.NewReader(buf))
r := NewBuzhash(bytes.NewReader(buf), false)

var chunks [][]byte

buzMin := buzMinDefault

for {
chunk, err := r.NextBytes()
if err != nil {
Expand Down Expand Up @@ -62,14 +64,14 @@ func TestBuzhashChunking(t *testing.T) {

func TestBuzhashChunkReuse(t *testing.T) {
newBuzhash := func(r io.Reader) Splitter {
return NewBuzhash(r)
return NewBuzhash(r, false)
}
testReuse(t, newBuzhash)
}

func BenchmarkBuzhash2(b *testing.B) {
benchmarkChunker(b, func(r io.Reader) Splitter {
return NewBuzhash(r)
return NewBuzhash(r, false)
})
}

Expand Down
5 changes: 4 additions & 1 deletion parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,10 @@ func FromString(r io.Reader, chunker string) (Splitter, error) {
return parseRabinString(r, chunker)

case chunker == "buzhash":
return NewBuzhash(r), nil
return NewBuzhash(r, false), nil

case chunker == "buzhash-legacy":
return NewBuzhash(r, true), nil

default:
return nil, fmt.Errorf("unrecognized chunker option: %s", chunker)
Expand Down