From f2498bcfddbce5f67e94d13ef8d8b2117c37cd10 Mon Sep 17 00:00:00 2001 From: "Masih H. Derkani" Date: Fri, 1 Jul 2022 15:49:42 +0100 Subject: [PATCH] Use streaming APIs to verify the hash of blocks in CAR `Inspect` `go-cid` exposes a `Sum` API that facilitates calculation of the CID from a `[]byte` payload. `go-multihash` now exposes `SumStream`, which can calculate a digest from an `io.Reader` as well as from `[]byte`. Unfortunately, the equivalent API does not exist in `go-cid`. To avoid copying the entire block into memory, implement CID calculation using the streaming multihash sum during inspection of the CAR payload. --- v2/reader.go | 42 ++++++++++++++++++++++++++++-------------- v2/reader_test.go | 4 ++-- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/v2/reader.go b/v2/reader.go index c3ef3653..cd10b81d 100644 --- a/v2/reader.go +++ b/v2/reader.go @@ -11,6 +11,7 @@ import ( "github.com/ipld/go-car/v2/internal/carv1/util" internalio "github.com/ipld/go-car/v2/internal/io" "github.com/multiformats/go-multicodec" + "github.com/multiformats/go-multihash" "github.com/multiformats/go-varint" "golang.org/x/exp/mmap" ) @@ -266,23 +267,36 @@ func (r *Reader) Inspect(validateBlockHash bool) (CarStats, error) { blockLength := sectionLength - uint64(cidLen) if validateBlockHash { - // read the block data, hash it and compare it - buf := make([]byte, blockLength) - if _, err := io.ReadFull(dr, buf); err != nil { - return CarStats{}, err + // Use multihash.SumStream to avoid having to copy the entire block content into memory. + // The SumStream uses a buffered copy to write bytes into the hasher which will take + // advantage of streaming hash calculation depending on the hash function. + // TODO: introduce SumStream in go-cid to simplify the code here. 
+ blockReader := io.LimitReader(dr, int64(blockLength)) + mhl := cp.MhLength + if mhtype == multicodec.Identity { + mhl = -1 } - - hashed, err := cp.Sum(buf) + mh, err := multihash.SumStream(blockReader, cp.MhType, mhl) if err != nil { return CarStats{}, err } - - if !hashed.Equals(c) { - return CarStats{}, fmt.Errorf("mismatch in content integrity, expected: %s, got: %s", c, hashed) + var wantCid cid.Cid + switch cp.Version { + case 0: + wantCid = cid.NewCidV0(mh) + case 1: + wantCid = cid.NewCidV1(cp.Codec, mh) + default: + return CarStats{}, fmt.Errorf("invalid cid version: %d", cp.Version) + } + if !wantCid.Equals(c) { + return CarStats{}, fmt.Errorf("mismatch in content integrity, expected: %s, got: %s", wantCid, c) } } else { // otherwise, skip over it - dr.Seek(int64(blockLength), io.SeekCurrent) + if _, err := dr.Seek(int64(blockLength), io.SeekCurrent); err != nil { + return CarStats{}, err + } } stats.BlockCount++ @@ -294,11 +308,11 @@ func (r *Reader) Inspect(validateBlockHash bool) (CarStats, error) { if uint64(cidLen) > stats.MaxCidLength { stats.MaxCidLength = uint64(cidLen) } - if uint64(blockLength) < minBlockLength { - minBlockLength = uint64(blockLength) + if blockLength < minBlockLength { + minBlockLength = blockLength } - if uint64(blockLength) > stats.MaxBlockLength { - stats.MaxBlockLength = uint64(blockLength) + if blockLength > stats.MaxBlockLength { + stats.MaxBlockLength = blockLength } } diff --git a/v2/reader_test.go b/v2/reader_test.go index 85605dad..5686d844 100644 --- a/v2/reader_test.go +++ b/v2/reader_test.go @@ -477,14 +477,14 @@ func TestInspectError(t *testing.T) { // header cid data carHex: "11a265726f6f7473806776657273696f6e 012e0155122001d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca ffffffffffffffffffff", validateBlockHash: true, - expectedInspectError: "mismatch in content integrity, expected: bafkreiab2rek7wjiazkfrt3hbnqpljmu24226alszdlh6ivic2abgjubzi, got: 
bafkreiaaqoxrddiyuy6gxnks6ioqytxhq5a7tchm2mm5htigznwiljukmm", + expectedInspectError: "mismatch in content integrity, expected: bafkreiaaqoxrddiyuy6gxnks6ioqytxhq5a7tchm2mm5htigznwiljukmm, got: bafkreiab2rek7wjiazkfrt3hbnqpljmu24226alszdlh6ivic2abgjubzi", }, { name: "IdentityCID", // a case where this _could_ be a valid CAR if we allowed identity CIDs and not matching block contents to exist, there's no block bytes in this // 47 {version:1,roots:[identity cid]} 25 identity cid (dag-json {"identity":"block"}) carHex: "2f a265726f6f747381d82a581a0001a90200147b226964656e74697479223a22626c6f636b227d6776657273696f6e01 19 01a90200147b226964656e74697479223a22626c6f636b227d", validateBlockHash: true, - expectedInspectError: "mismatch in content integrity, expected: baguqeaaupmrgszdfnz2gs5dzei5ceytmn5rwwit5, got: baguqeaaa", + expectedInspectError: "mismatch in content integrity, expected: baguqeaaa, got: baguqeaaupmrgszdfnz2gs5dzei5ceytmn5rwwit5", }, // the bad index tests are manually constructed from this single-block CARv2 by adjusting the Uint32 and Uint64 values in the index: // pragma carv2 header carv1 icodec count codec count (swi) width dataLen mh offset