From e072d34d2a05b1361bf80e6157e3707fe0d49e64 Mon Sep 17 00:00:00 2001 From: Rod Vagg Date: Mon, 6 Mar 2023 22:15:01 +1100 Subject: [PATCH] feat: extract specific path, accept stdin as streaming input --- cmd/car/car.go | 8 +- cmd/car/extract.go | 354 ++++++++++++++------- cmd/car/testdata/script/create-extract.txt | 2 +- 3 files changed, 255 insertions(+), 109 deletions(-) diff --git a/cmd/car/car.go b/cmd/car/car.go index 70c9eb32..81cec5a3 100644 --- a/cmd/car/car.go +++ b/cmd/car/car.go @@ -79,10 +79,16 @@ func main1() int { &cli.StringFlag{ Name: "file", Aliases: []string{"f"}, - Usage: "The car file to extract from", + Usage: "The car file to extract from, or '-' to read from stdin", Required: true, TakesFile: true, }, + &cli.StringFlag{ + Name: "path", + Aliases: []string{"p"}, + Usage: "The unixfs path to extract", + Required: false, + }, &cli.BoolFlag{ Name: "verbose", Aliases: []string{"v"}, diff --git a/cmd/car/extract.go b/cmd/car/extract.go index ae2da875..db4f207f 100644 --- a/cmd/car/extract.go +++ b/cmd/car/extract.go @@ -1,23 +1,27 @@ package main import ( - "bytes" + "context" "errors" "fmt" "io" "os" "path" "path/filepath" + "strings" + "sync" "github.com/ipfs/go-cid" "github.com/ipfs/go-unixfsnode" "github.com/ipfs/go-unixfsnode/data" "github.com/ipfs/go-unixfsnode/file" - "github.com/ipld/go-car/v2/blockstore" + "github.com/ipld/go-car/v2" + carstorage "github.com/ipld/go-car/v2/storage" dagpb "github.com/ipld/go-codec-dagpb" "github.com/ipld/go-ipld-prime" cidlink "github.com/ipld/go-ipld-prime/linking/cid" basicnode "github.com/ipld/go-ipld-prime/node/basic" + "github.com/ipld/go-ipld-prime/storage" "github.com/urfave/cli/v2" ) @@ -33,92 +37,109 @@ func ExtractCar(c *cli.Context) error { outputDir = c.Args().First() } - bs, err := blockstore.OpenReadOnly(c.String("file")) - if err != nil { - return err - } + var store storage.ReadableStorage + var roots []cid.Cid - ls := cidlink.DefaultLinkSystem() - ls.TrustedStorage = true - ls.StorageReadOpener = func(_ ipld.LinkContext, l ipld.Link) (io.Reader, error) { - cl, ok := l.(cidlink.Link) - if !ok { - return nil, fmt.Errorf("not a cidlink") + if c.String("file") == "-" { + var err error + store, roots, err = NewStdinReadStorage(c.App.Reader) + if err != nil { + return err } - blk, err := bs.Get(c.Context, cl.Cid) + } else { + carFile, err := os.Open(c.String("file")) if err != nil { - return nil, err + return err + } + store, err = carstorage.OpenReadable(carFile) + if err != nil { + return err } - return bytes.NewBuffer(blk.RawData()), nil + roots = store.(carstorage.ReadableCar).Roots() } - roots, err := bs.Roots() + ls := cidlink.DefaultLinkSystem() + ls.TrustedStorage = true + ls.SetReadStorage(store) + + path, err := pathSegments(c.String("path")) if err != nil { return err } + var extractedFiles int for _, root := range roots { - if err := extractRoot(c, &ls, root, outputDir); err != nil { + count, err := extractRoot(c, &ls, root, outputDir, path) + if err != nil { return err } + extractedFiles += count + } + if extractedFiles == 0 { + fmt.Fprintf(c.App.ErrWriter, "no files extracted\n") + } else { + fmt.Fprintf(c.App.ErrWriter, "extracted %d file(s)\n", extractedFiles) } return nil } -func extractRoot(c *cli.Context, ls *ipld.LinkSystem, root cid.Cid, outputDir string) error { +func extractRoot(c *cli.Context, ls *ipld.LinkSystem, root cid.Cid, outputDir string, path []string) (int, error) { if root.Prefix().Codec == cid.Raw { if c.IsSet("verbose") { fmt.Fprintf(c.App.ErrWriter, "skipping raw root %s\n", root) } - return nil + return 0, nil } pbn, err := ls.Load(ipld.LinkContext{}, cidlink.Link{Cid: root}, dagpb.Type.PBNode) if err != nil { - return err + return 0, err } pbnode := pbn.(dagpb.PBNode) ufn, err := unixfsnode.Reify(ipld.LinkContext{}, pbnode, ls) if err != nil { - return err + return 0, err } outputResolvedDir, err := filepath.EvalSymlinks(outputDir) if err != nil { - return err + return 0, err } if _, err := os.Stat(outputResolvedDir); os.IsNotExist(err) { if err := os.Mkdir(outputResolvedDir, 0755); err != nil { - return err + return 0, err } } - if err := extractDir(c, ls, ufn, outputResolvedDir, "/"); err != nil { + count, err := extractDir(c, ls, ufn, outputResolvedDir, "/", path) + if err != nil { if !errors.Is(err, ErrNotDir) { - return fmt.Errorf("%s: %w", root, err) + return 0, fmt.Errorf("%s: %w", root, err) } + + // if it's not a directory, it's a file. ufsData, err := pbnode.LookupByString("Data") if err != nil { - return err + return 0, err } ufsBytes, err := ufsData.AsBytes() if err != nil { - return err + return 0, err } ufsNode, err := data.DecodeUnixFSData(ufsBytes) if err != nil { - return err + return 0, err } if ufsNode.DataType.Int() == data.Data_File || ufsNode.DataType.Int() == data.Data_Raw { if err := extractFile(c, ls, pbnode, filepath.Join(outputResolvedDir, "unknown")); err != nil { - return err + return 0, err } } - return nil + return 1, nil } - return nil + return count, nil } func resolvePath(root, pth string) (string, error) { @@ -139,99 +160,131 @@ func resolvePath(root, pth string) (string, error) { return joined, nil } -func extractDir(c *cli.Context, ls *ipld.LinkSystem, n ipld.Node, outputRoot, outputPath string) error { +func extractDir(c *cli.Context, ls *ipld.LinkSystem, n ipld.Node, outputRoot, outputPath string, matchPath []string) (int, error) { dirPath, err := resolvePath(outputRoot, outputPath) if err != nil { - return err + return 0, err } // make the directory. if err := os.MkdirAll(dirPath, 0755); err != nil { - return err + return 0, err } - if n.Kind() == ipld.Kind_Map { - mi := n.MapIterator() - for !mi.Done() { - key, val, err := mi.Next() - if err != nil { - return err - } - ks, err := key.AsString() - if err != nil { - return err - } - nextRes, err := resolvePath(outputRoot, path.Join(outputPath, ks)) - if err != nil { - return err - } - if c.IsSet("verbose") { - fmt.Fprintf(c.App.Writer, "%s\n", nextRes) - } + if n.Kind() != ipld.Kind_Map { + return 0, ErrNotDir + } - if val.Kind() != ipld.Kind_Link { - return fmt.Errorf("unexpected map value for %s at %s", ks, outputPath) - } - // a directory may be represented as a map of name: if unixADL is applied - vl, err := val.AsLink() - if err != nil { - return err - } - dest, err := ls.Load(ipld.LinkContext{}, vl, basicnode.Prototype.Any) - if err != nil { - return err + subPath := matchPath + if len(matchPath) > 0 { + subPath = matchPath[1:] + } + + extractElement := func(name string, n ipld.Node) (int, error) { + nextRes, err := resolvePath(outputRoot, path.Join(outputPath, name)) + if err != nil { + return 0, err + } + if c.IsSet("verbose") { + fmt.Fprintf(c.App.Writer, "%s\n", nextRes) + } + + if n.Kind() != ipld.Kind_Link { + return 0, fmt.Errorf("unexpected map value for %s at %s", name, outputPath) + } + // a directory may be represented as a map of name: if unixADL is applied + vl, err := n.AsLink() + if err != nil { + return 0, err + } + dest, err := ls.Load(ipld.LinkContext{}, vl, basicnode.Prototype.Any) + if err != nil { + if nf, ok := err.(interface{ NotFound() bool }); ok && nf.NotFound() { + fmt.Fprintf(c.App.ErrWriter, "data for directory entry not found: %s (skipping...)\n", name) + return 0, nil } - // degenerate files are handled here. - if dest.Kind() == ipld.Kind_Bytes { - if err := extractFile(c, ls, dest, nextRes); err != nil { - return err - } - continue - } else { - // dir / pbnode - pbb := dagpb.Type.PBNode.NewBuilder() - if err := pbb.AssignNode(dest); err != nil { - return err - } - dest = pbb.Build() + return 0, err + } + // degenerate files are handled here. + if dest.Kind() == ipld.Kind_Bytes { + if err := extractFile(c, ls, dest, nextRes); err != nil { + return 0, err } - pbnode := dest.(dagpb.PBNode) + return 1, nil + } - // interpret dagpb 'data' as unixfs data and look at type. - ufsData, err := pbnode.LookupByString("Data") - if err != nil { - return err - } - ufsBytes, err := ufsData.AsBytes() + // dir / pbnode + pbb := dagpb.Type.PBNode.NewBuilder() + if err := pbb.AssignNode(dest); err != nil { + return 0, err + } + pbnode := pbb.Build().(dagpb.PBNode) + + // interpret dagpb 'data' as unixfs data and look at type. + ufsData, err := pbnode.LookupByString("Data") + if err != nil { + return 0, err + } + ufsBytes, err := ufsData.AsBytes() + if err != nil { + return 0, err + } + ufsNode, err := data.DecodeUnixFSData(ufsBytes) + if err != nil { + return 0, err + } + + switch ufsNode.DataType.Int() { + case data.Data_Directory, data.Data_HAMTShard: + ufn, err := unixfsnode.Reify(ipld.LinkContext{}, pbnode, ls) if err != nil { - return err + return 0, err } - ufsNode, err := data.DecodeUnixFSData(ufsBytes) - if err != nil { - return err + return extractDir(c, ls, ufn, outputRoot, path.Join(outputPath, name), subPath) + case data.Data_File, data.Data_Raw: + if err := extractFile(c, ls, pbnode, nextRes); err != nil { + return 0, err } - if ufsNode.DataType.Int() == data.Data_Directory || ufsNode.DataType.Int() == data.Data_HAMTShard { - ufn, err := unixfsnode.Reify(ipld.LinkContext{}, pbnode, ls) - if err != nil { - return err - } - - if err := extractDir(c, ls, ufn, outputRoot, path.Join(outputPath, ks)); err != nil { - return err - } - } else if ufsNode.DataType.Int() == data.Data_File || ufsNode.DataType.Int() == data.Data_Raw { - if err := extractFile(c, ls, pbnode, nextRes); err != nil { - return err - } - } else if ufsNode.DataType.Int() == data.Data_Symlink { - data := ufsNode.Data.Must().Bytes() - if err := os.Symlink(string(data), nextRes); err != nil { - return err - } + return 1, nil + case data.Data_Symlink: + data := ufsNode.Data.Must().Bytes() + if err := os.Symlink(string(data), nextRes); err != nil { + return 0, err } + return 1, nil + default: + return 0, fmt.Errorf("unknown unixfs type: %d", ufsNode.DataType.Int()) + } + } + + // specific path segment + if len(matchPath) > 0 { + val, err := n.LookupByString(matchPath[0]) + if err != nil { + return 0, err + } + return extractElement(matchPath[0], val) + } + + // everything + var count int + mi := n.MapIterator() + for !mi.Done() { + key, val, err := mi.Next() + if err != nil { + return 0, err } - return nil + ks, err := key.AsString() + if err != nil { + return 0, err + } + ecount, err := extractElement(ks, val) + if err != nil { + return 0, err + } + count += ecount } - return ErrNotDir + + return count, nil } func extractFile(c *cli.Context, ls *ipld.LinkSystem, n ipld.Node, outputName string) error { @@ -253,3 +306,90 @@ func extractFile(c *cli.Context, ls *ipld.LinkSystem, n ipld.Node, outputName st return err } + +// TODO: dedupe this with lassie, probably into go-unixfsnode +func pathSegments(path string) ([]string, error) { + segments := strings.Split(path, "/") + filtered := make([]string, 0, len(segments)) + for i := 0; i < len(segments); i++ { + if segments[i] == "" { + // Allow one leading and one trailing '/' at most + if i == 0 || i == len(segments)-1 { + continue + } + return nil, fmt.Errorf("invalid empty path segment at position %d", i) + } + if segments[i] == "." || segments[i] == ".." { + return nil, fmt.Errorf("'%s' is unsupported in paths", segments[i]) + } + filtered = append(filtered, segments[i]) + } + return filtered, nil +} + +var _ storage.ReadableStorage = (*stdinReadStorage)(nil) + +type stdinReadStorage struct { + blocks map[string][]byte + done bool + lk *sync.RWMutex + cond *sync.Cond +} + +func NewStdinReadStorage(reader io.Reader) (*stdinReadStorage, []cid.Cid, error) { + var lk sync.RWMutex + srs := &stdinReadStorage{ + blocks: make(map[string][]byte), + lk: &lk, + cond: sync.NewCond(&lk), + } + rdr, err := car.NewBlockReader(reader) + if err != nil { + return nil, nil, err + } + go func() { + for { + blk, err := rdr.Next() + if err == io.EOF { + srs.lk.Lock() + srs.done = true + srs.lk.Unlock() + return + } + if err != nil { + panic(err) + } + srs.lk.Lock() + srs.blocks[string(blk.Cid().Hash())] = blk.RawData() + srs.cond.Broadcast() + srs.lk.Unlock() + } + }() + return srs, rdr.Roots, nil +} + +func (srs *stdinReadStorage) Has(ctx context.Context, key string) (bool, error) { + _, err := srs.Get(ctx, key) + if err != nil { + return false, err + } + return true, nil +} + +func (srs *stdinReadStorage) Get(ctx context.Context, key string) ([]byte, error) { + c, err := cid.Cast([]byte(key)) + if err != nil { + return nil, err + } + srs.lk.Lock() + defer srs.lk.Unlock() + for { + if data, ok := srs.blocks[string(c.Hash())]; ok { + return data, nil + } + if srs.done { + return nil, carstorage.ErrNotFound{Cid: c} + } + srs.cond.Wait() + } +} diff --git a/cmd/car/testdata/script/create-extract.txt b/cmd/car/testdata/script/create-extract.txt index 648bafc4..6dac510b 100644 --- a/cmd/car/testdata/script/create-extract.txt +++ b/cmd/car/testdata/script/create-extract.txt @@ -1,8 +1,8 @@ car create --file=out.car foo.txt bar.txt mkdir out car extract -v -f out.car out -! stderr . stdout -count=2 'txt$' +stderr -count=1 '^extracted 2 file\(s\)$' car create --file=out2.car out/foo.txt out/bar.txt cmp out.car out2.car