diff --git a/data/builder/dir_test.go b/data/builder/dir_test.go
index 954d2dd..a2b0698 100644
--- a/data/builder/dir_test.go
+++ b/data/builder/dir_test.go
@@ -3,12 +3,16 @@ package builder
 import (
 	"bytes"
 	"fmt"
+	"os"
+	"path/filepath"
 	"testing"
 
+	"github.com/ipfs/go-cid"
 	"github.com/ipfs/go-unixfsnode"
 	dagpb "github.com/ipld/go-codec-dagpb"
 	"github.com/ipld/go-ipld-prime"
 	cidlink "github.com/ipld/go-ipld-prime/linking/cid"
+	"github.com/stretchr/testify/require"
 )
 
 func mkEntries(cnt int, ls *ipld.LinkSystem) ([]dagpb.PBLink, error) {
@@ -41,7 +45,7 @@ func TestBuildUnixFSDirectory(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	dl, err := BuildUnixFSDirectory(entries, &ls)
+	dl, _, err := BuildUnixFSDirectory(entries, &ls)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -70,3 +74,124 @@ func TestBuildUnixFSDirectory(t *testing.T) {
 		}
 	}
 }
+
+func TestBuildUnixFSRecursive(t *testing.T) {
+	// only the top CID is of interest, but this tree is correct and can be used for future validation
+	fixture := fentry{
+		"rootDir",
+		"",
+		mustCidDecode("bafybeihswl3f7pa7fueyayewcvr3clkdz7oetv4jolyejgw26p6l3qzlbm"),
+		[]fentry{
+			{"a", "aaa", mustCidDecode("bafkreieygsdw3t5qlsywpjocjfj6xjmmjlejwgw7k7zi6l45bgxra7xi6a"), nil},
+			{
+				"b",
+				"",
+				mustCidDecode("bafybeibohj54uixf2mso4t53suyarv6cfuxt6b5cj6qjsqaa2ezfxnu5pu"),
+				[]fentry{
+					{"1", "111", mustCidDecode("bafkreihw4cq6flcbsrnjvj77rkfkudhlyevdxteydkjjvvopqefasdqrvy"), nil},
+					{"2", "222", mustCidDecode("bafkreie3q4kremt4bhhjdxletm7znjr3oqeo6jt4rtcxcaiu4yuxgdfwd4"), nil},
+				},
+			},
+			{"c", "ccc", mustCidDecode("bafkreide3ksevvet74uks3x7vnxhp4ltfi6zpwbsifmbwn6324fhusia7y"), nil},
+		},
+	}
+
+	ls := cidlink.DefaultLinkSystem()
+	storage := cidlink.Memory{}
+	ls.StorageReadOpener = storage.OpenRead
+	ls.StorageWriteOpener = storage.OpenWrite
+
+	dir := t.TempDir()
+	makeFixture(t, dir, fixture)
+
+	lnk, sz, err := BuildUnixFSRecursive(filepath.Join(dir, fixture.name), &ls)
+	require.NoError(t, err)
+	require.Equal(t, fixture.expectedLnk.String(), lnk.String())
+	require.Equal(t, uint64(245), sz)
+}
+
+func TestBuildUnixFSRecursiveLargeSharded(t *testing.T) {
+	// only the top CID is of interest, but this tree is correct and can be used for future validation
+	fixture := fentry{
+		"rootDir",
+		"",
+		mustCidDecode("bafybeigyvxs6og5jbmpaa43qbhhd5swklqcfzqdrtjgfh53qjon6hpjaye"),
+		make([]fentry, 0),
+	}
+
+	for i := 0; i < 1344; i++ {
+		name := fmt.Sprintf("long name to fill out bytes to make the sharded directory test flip over the sharded directory limit because link names are included in the directory entry %d", i)
+		fixture.children = append(fixture.children, fentry{name, name, cid.Undef, nil})
+	}
+
+	ls := cidlink.DefaultLinkSystem()
+	storage := cidlink.Memory{}
+	ls.StorageReadOpener = storage.OpenRead
+	ls.StorageWriteOpener = storage.OpenWrite
+
+	dir := t.TempDir()
+	makeFixture(t, dir, fixture)
+
+	lnk, sz, err := BuildUnixFSRecursive(filepath.Join(dir, fixture.name), &ls)
+	require.NoError(t, err)
+	require.Equal(t, fixture.expectedLnk.String(), lnk.String())
+	require.Equal(t, uint64(515735), sz)
+}
+
+// Same as TestBuildUnixFSRecursiveLargeSharded but it's one file less which flips
+// it back to the un-sharded format. So we're testing the boundary condition and
+// the proper construction of large DAGs.
+func TestBuildUnixFSRecursiveLargeUnsharded(t *testing.T) {
+	// only the top CID is of interest, but this tree is correct and can be used for future validation
+	fixture := fentry{
+		"rootDir",
+		"",
+		mustCidDecode("bafybeihecq4rpl4nw3cgfb2uiwltgsmw5sutouvuldv5fxn4gfbihvnalq"),
+		make([]fentry, 0),
+	}
+
+	for i := 0; i < 1343; i++ {
+		name := fmt.Sprintf("long name to fill out bytes to make the sharded directory test flip over the sharded directory limit because link names are included in the directory entry %d", i)
+		fixture.children = append(fixture.children, fentry{name, name, cid.Undef, nil})
+	}
+
+	ls := cidlink.DefaultLinkSystem()
+	storage := cidlink.Memory{}
+	ls.StorageReadOpener = storage.OpenRead
+	ls.StorageWriteOpener = storage.OpenWrite
+
+	dir := t.TempDir()
+	makeFixture(t, dir, fixture)
+
+	lnk, sz, err := BuildUnixFSRecursive(filepath.Join(dir, fixture.name), &ls)
+	require.NoError(t, err)
+	require.Equal(t, fixture.expectedLnk.String(), lnk.String())
+	require.Equal(t, uint64(490665), sz)
+}
+
+type fentry struct {
+	name        string
+	content     string
+	expectedLnk cid.Cid
+	children    []fentry
+}
+
+func makeFixture(t *testing.T, dir string, fixture fentry) {
+	path := filepath.Join(dir, fixture.name)
+	if fixture.children != nil {
+		require.NoError(t, os.Mkdir(path, 0755))
+		for _, c := range fixture.children {
+			makeFixture(t, path, c)
+		}
+	} else {
+		require.NoError(t, os.WriteFile(path, []byte(fixture.content), 0644))
+	}
+}
+
+func mustCidDecode(s string) cid.Cid {
+	c, err := cid.Decode(s)
+	if err != nil {
+		panic(err)
+	}
+	return c
+}
diff --git a/data/builder/directory.go b/data/builder/directory.go
index 3ea9e36..afb83e6 100644
--- a/data/builder/directory.go
+++ b/data/builder/directory.go
@@ -30,6 +30,7 @@ func BuildUnixFSRecursive(root string, ls *ipld.LinkSystem) (ipld.Link, uint64,
 	m := info.Mode()
 	switch {
 	case m.IsDir():
+		var tsize uint64
 		entries, err := os.ReadDir(root)
 		if err != nil {
 			return nil, 0, err
@@ -40,27 +41,35 @@
 			if err != nil {
 				return nil, 0, err
 			}
+			tsize += sz
 			entry, err := BuildUnixFSDirectoryEntry(e.Name(), int64(sz), lnk)
 			if err != nil {
 				return nil, 0, err
 			}
 			lnks = append(lnks, entry)
 		}
-		outLnk, err := BuildUnixFSDirectory(lnks, ls)
-		return outLnk, 0, err
+		return BuildUnixFSDirectory(lnks, ls)
 	case m.Type() == fs.ModeSymlink:
 		content, err := os.Readlink(root)
 		if err != nil {
 			return nil, 0, err
 		}
-		return BuildUnixFSSymlink(content, ls)
+		outLnk, sz, err := BuildUnixFSSymlink(content, ls)
+		if err != nil {
+			return nil, 0, err
+		}
+		return outLnk, sz, nil
 	case m.IsRegular():
 		fp, err := os.Open(root)
 		if err != nil {
 			return nil, 0, err
 		}
 		defer fp.Close()
-		return BuildUnixFSFile(fp, "", ls)
+		outLnk, sz, err := BuildUnixFSFile(fp, "", ls)
+		if err != nil {
+			return nil, 0, err
+		}
+		return outLnk, sz, nil
 	default:
 		return nil, 0, fmt.Errorf("cannot encode non regular file: %s", root)
 	}
@@ -87,7 +96,7 @@ func estimateDirSize(entries []dagpb.PBLink) int {
 }
 
 // BuildUnixFSDirectory creates a directory link over a collection of entries.
-func BuildUnixFSDirectory(entries []dagpb.PBLink, ls *ipld.LinkSystem) (ipld.Link, error) {
+func BuildUnixFSDirectory(entries []dagpb.PBLink, ls *ipld.LinkSystem) (ipld.Link, uint64, error) {
 	if estimateDirSize(entries) > shardSplitThreshold {
 		return BuildUnixFSShardedDirectory(defaultShardWidth, multihash.MURMUR3X64_64, entries, ls)
 	}
@@ -95,38 +104,44 @@ func BuildUnixFSDirectory(entries []dagpb.PBLink, ls *ipld.LinkSystem) (ipld.Lin
 		DataType(b, data.Data_Directory)
 	})
 	if err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 	pbb := dagpb.Type.PBNode.NewBuilder()
 	pbm, err := pbb.BeginMap(2)
 	if err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 	if err = pbm.AssembleKey().AssignString("Data"); err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 	if err = pbm.AssembleValue().AssignBytes(data.EncodeUnixFSData(ufd)); err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 	if err = pbm.AssembleKey().AssignString("Links"); err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 	lnks, err := pbm.AssembleValue().BeginList(int64(len(entries)))
 	if err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 	// sorting happens in codec-dagpb
+	var totalSize uint64
 	for _, e := range entries {
+		totalSize += uint64(e.Tsize.Must().Int())
 		if err := lnks.AssembleValue().AssignNode(e); err != nil {
-			return nil, err
+			return nil, 0, err
 		}
 	}
 	if err := lnks.Finish(); err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 	if err := pbm.Finish(); err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 	node := pbb.Build()
-	return ls.Store(ipld.LinkContext{}, fileLinkProto, node)
+	lnk, sz, err := sizedStore(ls, fileLinkProto, node)
+	if err != nil {
+		return nil, 0, err
+	}
+	return lnk, totalSize + sz, err
 }
diff --git a/data/builder/dirshard.go b/data/builder/dirshard.go
index a25aa66..299c81f 100644
--- a/data/builder/dirshard.go
+++ b/data/builder/dirshard.go
@@ -39,7 +39,7 @@ type hamtLink struct {
 
 // BuildUnixFSShardedDirectory will build a hamt of unixfs hamt shards encoing a directory with more entries
 // than is typically allowed to fit in a standard IPFS single-block unixFS directory.
-func BuildUnixFSShardedDirectory(size int, hasher uint64, entries []dagpb.PBLink, ls *ipld.LinkSystem) (ipld.Link, error) {
+func BuildUnixFSShardedDirectory(size int, hasher uint64, entries []dagpb.PBLink, ls *ipld.LinkSystem) (ipld.Link, uint64, error) {
 	// hash the entries
 	var h hash.Hash
 	var err error
@@ -50,13 +50,15 @@ func BuildUnixFSShardedDirectory(size int, hasher uint64, entries []dagpb.PBLink
 	} else {
 		h, err = multihash.GetHasher(hasher)
 		if err != nil {
-			return nil, err
+			return nil, 0, err
 		}
 	}
 	hamtEntries := make([]hamtLink, 0, len(entries))
 	for _, e := range entries {
 		name := e.Name.Must().String()
-		sum := h.Sum([]byte(name))
+		h.Reset()
+		h.Write([]byte(name))
+		sum := h.Sum(nil)
 		hamtEntries = append(hamtEntries, hamtLink{
 			sum,
 			e,
@@ -65,7 +67,7 @@
 
 	sizeLg2, err := logtwo(size)
 	if err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 
 	sharder := shard{
@@ -81,7 +83,7 @@
 	for _, entry := range hamtEntries {
 		err := sharder.add(entry)
 		if err != nil {
-			return nil, err
+			return nil, 0, err
 		}
 	}
 
@@ -97,9 +99,11 @@ func (s *shard) add(lnk hamtLink) error {
 
 	current, ok := s.children[bucket]
 	if !ok {
+		// no bucket, make one with this entry
 		s.children[bucket] = entry{nil, &lnk}
 		return nil
 	} else if current.shard != nil {
+		// existing shard, add this link to the shard
 		return current.shard.add(lnk)
 	}
 	// make a shard for current and lnk
@@ -114,15 +118,18 @@ func (s *shard) add(lnk hamtLink) error {
 		},
 		nil,
 	}
+	// add existing link from this bucket to the new shard
 	if err := newShard.add(*current.hamtLink); err != nil {
 		return err
 	}
+	// replace bucket with shard
 	s.children[bucket] = newShard
+	// add new link to the new shard
 	return newShard.add(lnk)
 }
 
 func (s *shard) formatLinkName(name string, idx int) string {
-	return fmt.Sprintf("%*X%s", s.width, idx, name)
+	return fmt.Sprintf("%0*X%s", s.width, idx, name)
 }
 
 // bitmap calculates the bitmap of which links in the shard are set.
@@ -138,7 +145,7 @@ func (s *shard) bitmap() []byte {
 
 // serialize stores the concrete representation of this shard in the link system and
 // returns a link to it.
-func (s *shard) serialize(ls *ipld.LinkSystem) (ipld.Link, error) {
+func (s *shard) serialize(ls *ipld.LinkSystem) (ipld.Link, uint64, error) {
 	ufd, err := BuildUnixFS(func(b *Builder) {
 		DataType(b, data.Data_HAMTShard)
 		HashType(b, s.hasher)
@@ -146,59 +153,67 @@ func (s *shard) serialize(ls *ipld.LinkSystem) (ipld.Link, error) {
 		Fanout(b, uint64(s.size))
 	})
 	if err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 	pbb := dagpb.Type.PBNode.NewBuilder()
 	pbm, err := pbb.BeginMap(2)
 	if err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 	if err = pbm.AssembleKey().AssignString("Data"); err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 	if err = pbm.AssembleValue().AssignBytes(data.EncodeUnixFSData(ufd)); err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 	if err = pbm.AssembleKey().AssignString("Links"); err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 
 	lnkBuilder := dagpb.Type.PBLinks.NewBuilder()
 	lnks, err := lnkBuilder.BeginList(int64(len(s.children)))
 	if err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 	// sorting happens in codec-dagpb
+	var totalSize uint64
 	for idx, e := range s.children {
 		var lnk dagpb.PBLink
 		if e.shard != nil {
-			ipldLnk, err := e.shard.serialize(ls)
+			ipldLnk, sz, err := e.shard.serialize(ls)
 			if err != nil {
-				return nil, err
+				return nil, 0, err
 			}
+			totalSize += sz
 			fullName := s.formatLinkName("", idx)
-			lnk, err = BuildUnixFSDirectoryEntry(fullName, 0, ipldLnk)
+			lnk, err = BuildUnixFSDirectoryEntry(fullName, int64(sz), ipldLnk)
 			if err != nil {
-				return nil, err
+				return nil, 0, err
 			}
 		} else {
 			fullName := s.formatLinkName(e.Name.Must().String(), idx)
-			lnk, err = BuildUnixFSDirectoryEntry(fullName, e.Tsize.Must().Int(), e.Hash.Link())
+			sz := e.Tsize.Must().Int()
+			totalSize += uint64(sz)
+			lnk, err = BuildUnixFSDirectoryEntry(fullName, sz, e.Hash.Link())
 		}
 		if err != nil {
-			return nil, err
+			return nil, 0, err
 		}
 		if err := lnks.AssembleValue().AssignNode(lnk); err != nil {
-			return nil, err
+			return nil, 0, err
 		}
 	}
 	if err := lnks.Finish(); err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 	pbm.AssembleValue().AssignNode(lnkBuilder.Build())
 	if err := pbm.Finish(); err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 	node := pbb.Build()
-	return ls.Store(ipld.LinkContext{}, fileLinkProto, node)
+	lnk, sz, err := sizedStore(ls, fileLinkProto, node)
+	if err != nil {
+		return nil, 0, err
+	}
+	return lnk, totalSize + sz, nil
 }
diff --git a/data/builder/file.go b/data/builder/file.go
index dd3448b..8e2e6f1 100644
--- a/data/builder/file.go
+++ b/data/builder/file.go
@@ -86,8 +86,7 @@ func fileTreeRecursive(depth int, children []ipld.Link, childLen []uint64, src c
 			return nil, 0, err
 		}
 		node := basicnode.NewBytes(leaf)
-		link, err := ls.Store(ipld.LinkContext{}, leafLinkProto, node)
-		return link, uint64(len(leaf)), err
+		return sizedStore(ls, leafLinkProto, node)
 	}
 	// depth > 1.
 	totalSize := uint64(0)
@@ -166,25 +165,11 @@ func fileTreeRecursive(depth int, children []ipld.Link, childLen []uint64, src c
 	}
 	pbn := dpbb.Build()
 
-	link, err := ls.Store(ipld.LinkContext{}, fileLinkProto, pbn)
+	link, sz, err := sizedStore(ls, fileLinkProto, pbn)
 	if err != nil {
 		return nil, 0, err
 	}
-	// calculate the dagpb node's size and add as overhead.
-	cl, ok := link.(cidlink.Link)
-	if !ok {
-		return nil, 0, fmt.Errorf("unexpected non-cid linksystem")
-	}
-	rawlnk := cid.NewCidV1(uint64(multicodec.Raw), cl.Cid.Hash())
-	rn, err := ls.Load(ipld.LinkContext{}, cidlink.Link{Cid: rawlnk}, basicnode.Prototype__Bytes{})
-	if err != nil {
-		return nil, 0, fmt.Errorf("could not re-interpret dagpb node as bytes: %w", err)
-	}
-	rnb, err := rn.AsBytes()
-	if err != nil {
-		return nil, 0, fmt.Errorf("could not parse dagpb node as bytes: %w", err)
-	}
-	return link, totalSize + uint64(len(rnb)), nil
+	return link, totalSize + sz, nil
 }
 
 // BuildUnixFSDirectoryEntry creates the link to a file or directory as it appears within a unixfs directory.
@@ -256,25 +241,7 @@
 	}
 	pbn := dpbb.Build()
 
-	link, err := ls.Store(ipld.LinkContext{}, fileLinkProto, pbn)
-	if err != nil {
-		return nil, 0, err
-	}
-	// calculate the size and add as overhead.
-	cl, ok := link.(cidlink.Link)
-	if !ok {
-		return nil, 0, fmt.Errorf("unexpected non-cid linksystem")
-	}
-	rawlnk := cid.NewCidV1(uint64(multicodec.Raw), cl.Cid.Hash())
-	rn, err := ls.Load(ipld.LinkContext{}, cidlink.Link{Cid: rawlnk}, basicnode.Prototype__Bytes{})
-	if err != nil {
-		return nil, 0, fmt.Errorf("could not re-interpret dagpb node as bytes: %w", err)
-	}
-	rnb, err := rn.AsBytes()
-	if err != nil {
-		return nil, 0, fmt.Errorf("could not re-interpret dagpb node as bytes: %w", err)
-	}
-	return link, uint64(len(rnb)), nil
+	return sizedStore(ls, fileLinkProto, pbn)
 }
 
 // Constants below are from
diff --git a/data/builder/util.go b/data/builder/util.go
index 8e5c0fb..808a5ff 100644
--- a/data/builder/util.go
+++ b/data/builder/util.go
@@ -2,7 +2,12 @@ package builder
 
 import (
 	"fmt"
+	"io"
 	"math/bits"
+
+	"github.com/ipld/go-ipld-prime"
+	"github.com/ipld/go-ipld-prime/codec"
+	"github.com/ipld/go-ipld-prime/datamodel"
 )
 
 // Common code from go-unixfs/hamt/util.go
@@ -54,3 +59,51 @@ func logtwo(v int) (int, error) {
 	}
 	return lg2, nil
 }
+
+func sizedStore(ls *ipld.LinkSystem, lp datamodel.LinkPrototype, n datamodel.Node) (datamodel.Link, uint64, error) {
+	var byteCount int
+	lnk, err := wrappedLinkSystem(ls, func(bc int) {
+		byteCount = bc
+	}).Store(ipld.LinkContext{}, lp, n)
+	return lnk, uint64(byteCount), err
+}
+
+type byteCounter struct {
+	w  io.Writer
+	bc int
+}
+
+func (bc *byteCounter) Write(p []byte) (int, error) {
+	bc.bc += len(p)
+	return bc.w.Write(p)
+}
+
+func wrappedLinkSystem(ls *ipld.LinkSystem, byteCountCb func(byteCount int)) *ipld.LinkSystem {
+	wrappedEncoder := func(encoder codec.Encoder) codec.Encoder {
+		return func(node datamodel.Node, writer io.Writer) error {
+			bc := byteCounter{w: writer}
+			err := encoder(node, &bc)
+			if err == nil {
+				byteCountCb(bc.bc)
+			}
+			return err
+		}
+	}
+	wrappedEncoderChooser := func(lp datamodel.LinkPrototype) (codec.Encoder, error) {
+		encoder, err := ls.EncoderChooser(lp)
+		if err != nil {
+			return nil, err
+		}
+		return wrappedEncoder(encoder), nil
+	}
+	return &ipld.LinkSystem{
+		EncoderChooser:     wrappedEncoderChooser,
+		DecoderChooser:     ls.DecoderChooser,
+		HasherChooser:      ls.HasherChooser,
+		StorageWriteOpener: ls.StorageWriteOpener,
+		StorageReadOpener:  ls.StorageReadOpener,
+		TrustedStorage:     ls.TrustedStorage,
+		NodeReifier:        ls.NodeReifier,
+		KnownReifiers:      ls.KnownReifiers,
+	}
+}
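Reviewer note (illustrative sketch, not part of the patch): the snippet below shows how a caller picks up the new cumulative-size return value from BuildUnixFSRecursive, using the same in-memory LinkSystem wiring as the tests above. The import path is assumed from the repository layout, and "/path/to/dir" is a placeholder.

package main

import (
	"fmt"

	"github.com/ipfs/go-unixfsnode/data/builder"
	cidlink "github.com/ipld/go-ipld-prime/linking/cid"
)

func main() {
	// In-memory link system, mirroring the setup in dir_test.go.
	ls := cidlink.DefaultLinkSystem()
	storage := cidlink.Memory{}
	ls.StorageReadOpener = storage.OpenRead
	ls.StorageWriteOpener = storage.OpenWrite

	// BuildUnixFSRecursive now returns the root link plus the total encoded size (tsize).
	lnk, size, err := builder.BuildUnixFSRecursive("/path/to/dir", &ls)
	if err != nil {
		panic(err)
	}
	fmt.Printf("root: %s, tsize: %d bytes\n", lnk, size)
}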