Skip to content

Commit

Permalink
Use streaming APIs to verify the hash of blocks in CAR Inspect
Browse files Browse the repository at this point in the history
`go-cid` exposes a `Sum` API that facilitates calculating a CID from a
`[]byte` payload. `go-multihash` now exposes `SumStream`, which can
calculate the digest from an `io.Reader` as well as from a `[]byte`.
Unfortunately, the equivalent streaming API does not exist in `go-cid`.

To avoid copying the entire block into memory, implement the CID calculation
using the streaming multihash sum during inspection of the CAR payload.
  • Loading branch information
masih committed Jul 6, 2022
1 parent 952fcb9 commit f2498bc
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 16 deletions.
42 changes: 28 additions & 14 deletions v2/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/ipld/go-car/v2/internal/carv1/util"
internalio "github.com/ipld/go-car/v2/internal/io"
"github.com/multiformats/go-multicodec"
"github.com/multiformats/go-multihash"
"github.com/multiformats/go-varint"
"golang.org/x/exp/mmap"
)
Expand Down Expand Up @@ -266,23 +267,36 @@ func (r *Reader) Inspect(validateBlockHash bool) (CarStats, error) {
blockLength := sectionLength - uint64(cidLen)

if validateBlockHash {
// read the block data, hash it and compare it
buf := make([]byte, blockLength)
if _, err := io.ReadFull(dr, buf); err != nil {
return CarStats{}, err
// Use multihash.SumStream to avoid having to copy the entire block content into memory.
// The SumStream uses a buffered copy to write bytes into the hasher which will take
// advantage of streaming hash calculation depending on the hash function.
// TODO: introduce SumStream in go-cid to simplify the code here.
blockReader := io.LimitReader(dr, int64(blockLength))
mhl := cp.MhLength
if mhtype == multicodec.Identity {
mhl = -1
}

hashed, err := cp.Sum(buf)
mh, err := multihash.SumStream(blockReader, cp.MhType, mhl)
if err != nil {
return CarStats{}, err
}

if !hashed.Equals(c) {
return CarStats{}, fmt.Errorf("mismatch in content integrity, expected: %s, got: %s", c, hashed)
var wantCid cid.Cid
switch cp.Version {
case 0:
wantCid = cid.NewCidV0(mh)
case 1:
wantCid = cid.NewCidV1(cp.Codec, mh)
default:
return CarStats{}, fmt.Errorf("invalid cid version: %d", cp.Version)
}
if !wantCid.Equals(c) {
return CarStats{}, fmt.Errorf("mismatch in content integrity, expected: %s, got: %s", wantCid, c)
}
} else {
// otherwise, skip over it
dr.Seek(int64(blockLength), io.SeekCurrent)
if _, err := dr.Seek(int64(blockLength), io.SeekCurrent); err != nil {
return CarStats{}, err
}
}

stats.BlockCount++
Expand All @@ -294,11 +308,11 @@ func (r *Reader) Inspect(validateBlockHash bool) (CarStats, error) {
if uint64(cidLen) > stats.MaxCidLength {
stats.MaxCidLength = uint64(cidLen)
}
if uint64(blockLength) < minBlockLength {
minBlockLength = uint64(blockLength)
if blockLength < minBlockLength {
minBlockLength = blockLength
}
if uint64(blockLength) > stats.MaxBlockLength {
stats.MaxBlockLength = uint64(blockLength)
if blockLength > stats.MaxBlockLength {
stats.MaxBlockLength = blockLength
}
}

Expand Down
4 changes: 2 additions & 2 deletions v2/reader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -477,14 +477,14 @@ func TestInspectError(t *testing.T) {
// header cid data
carHex: "11a265726f6f7473806776657273696f6e 012e0155122001d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca ffffffffffffffffffff",
validateBlockHash: true,
expectedInspectError: "mismatch in content integrity, expected: bafkreiab2rek7wjiazkfrt3hbnqpljmu24226alszdlh6ivic2abgjubzi, got: bafkreiaaqoxrddiyuy6gxnks6ioqytxhq5a7tchm2mm5htigznwiljukmm",
expectedInspectError: "mismatch in content integrity, expected: bafkreiaaqoxrddiyuy6gxnks6ioqytxhq5a7tchm2mm5htigznwiljukmm, got: bafkreiab2rek7wjiazkfrt3hbnqpljmu24226alszdlh6ivic2abgjubzi",
},
{
name: "IdentityCID", // a case where this _could_ be a valid CAR if we allowed identity CIDs and not matching block contents to exist, there's no block bytes in this
// 47 {version:1,roots:[identity cid]} 25 identity cid (dag-json {"identity":"block"})
carHex: "2f a265726f6f747381d82a581a0001a90200147b226964656e74697479223a22626c6f636b227d6776657273696f6e01 19 01a90200147b226964656e74697479223a22626c6f636b227d",
validateBlockHash: true,
expectedInspectError: "mismatch in content integrity, expected: baguqeaaupmrgszdfnz2gs5dzei5ceytmn5rwwit5, got: baguqeaaa",
expectedInspectError: "mismatch in content integrity, expected: baguqeaaa, got: baguqeaaupmrgszdfnz2gs5dzei5ceytmn5rwwit5",
},
// the bad index tests are manually constructed from this single-block CARv2 by adjusting the Uint32 and Uint64 values in the index:
// pragma carv2 header carv1 icodec count codec count (swi) width dataLen mh offset
Expand Down

0 comments on commit f2498bc

Please sign in to comment.