diff --git a/.github/workflows/go-fuzz.yml b/.github/workflows/go-fuzz.yml new file mode 100644 index 00000000..830fc9ec --- /dev/null +++ b/.github/workflows/go-fuzz.yml @@ -0,0 +1,46 @@ +on: [ push, pull_request ] +name: Go Fuzz + +jobs: + v1: + strategy: + fail-fast: true + matrix: + target: [ "CarReader" ] + runs-on: ubuntu-latest + name: Fuzz V1 ${{ matrix.target }} + steps: + - uses: actions/checkout@v2 + with: + submodules: recursive + - uses: actions/setup-go@v2 + with: + go-version: 1.18.x + - name: Go information + run: | + go version + go env + - name: Run Fuzzing for 1m + run: go test -v -fuzz=Fuzz${{ matrix.target }} -fuzztime=1m . + v2: + strategy: + fail-fast: true + matrix: + target: [ "BlockReader", "Reader", "Inspect" ] + runs-on: ubuntu-latest + name: Fuzz V2 ${{ matrix.target }} + steps: + - uses: actions/checkout@v2 + with: + submodules: recursive + - uses: actions/setup-go@v2 + with: + go-version: 1.18.x + - name: Go information + run: | + go version + go env + - name: Run Fuzzing for 1m + run: | + cd v2 + go test -v -fuzz=Fuzz${{ matrix.target }} -fuzztime=1m . diff --git a/car_test.go b/car_test.go index 9ae30909..3c6340be 100644 --- a/car_test.go +++ b/car_test.go @@ -75,9 +75,11 @@ func TestRoundtrip(t *testing.T) { } } +// fixture is a clean single-block, single-root CAR +const fixtureStr = "3aa265726f6f747381d82a58250001711220151fe9e73c6267a7060c6f6c4cca943c236f4b196723489608edb42a8b8fa80b6776657273696f6e012c01711220151fe9e73c6267a7060c6f6c4cca943c236f4b196723489608edb42a8b8fa80ba165646f646779f5" + func TestEOFHandling(t *testing.T) { - // fixture is a clean single-block, single-root CAR - fixture, err := hex.DecodeString("3aa265726f6f747381d82a58250001711220151fe9e73c6267a7060c6f6c4cca943c236f4b196723489608edb42a8b8fa80b6776657273696f6e012c01711220151fe9e73c6267a7060c6f6c4cca943c236f4b196723489608edb42a8b8fa80ba165646f646779f5") + fixture, err := hex.DecodeString(fixtureStr) if err != nil { t.Fatal(err) } diff --git a/fuzz_test.go b/fuzz_test.go new file mode 100644 index 00000000..8ac04bbd --- /dev/null +++ b/fuzz_test.go @@ -0,0 +1,35 @@ +//go:build go1.18 +// +build go1.18 + +package car_test + +import ( + "bytes" + "encoding/hex" + "io" + "testing" + + car "github.com/ipld/go-car" +) + +func FuzzCarReader(f *testing.F) { + fixture, err := hex.DecodeString(fixtureStr) + if err != nil { + f.Fatal(err) + } + f.Add(fixture) + + f.Fuzz(func(t *testing.T, data []byte) { + r, err := car.NewCarReader(bytes.NewReader(data)) + if err != nil { + return + } + + for { + _, err = r.Next() + if err == io.EOF { + return + } + } + }) +} diff --git a/testdata/fuzz/FuzzCarReader/21a90a70853c333c6b9ddc133bae39a14164874ed8abdee1f6a6795311a0e546 b/testdata/fuzz/FuzzCarReader/21a90a70853c333c6b9ddc133bae39a14164874ed8abdee1f6a6795311a0e546 new file mode 100644 index 00000000..a7ab1d51 --- /dev/null +++ b/testdata/fuzz/FuzzCarReader/21a90a70853c333c6b9ddc133bae39a14164874ed8abdee1f6a6795311a0e546 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\xe0\xe0\xe0\xe0\xa7\x06\folLʔ<#oK\x19g#H\x96\b\xed\xb4*\x8b\x8f\xa8\vgversion\x19") diff --git a/testdata/fuzz/FuzzCarReader/5857e57e4072c6b0d8684030cc13b5570849c89b3dbf5bc0152abc66c9642f3e b/testdata/fuzz/FuzzCarReader/5857e57e4072c6b0d8684030cc13b5570849c89b3dbf5bc0152abc66c9642f3e new file mode 100644 index 00000000..3e680cf6 --- /dev/null +++ b/testdata/fuzz/FuzzCarReader/5857e57e4072c6b0d8684030cc13b5570849c89b3dbf5bc0152abc66c9642f3e @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte(":\xa2eroots\x81\xd80X%\x00\x0100 
00000000000000000000000000000000gversion\x01\x010") diff --git a/util/util.go b/util/util.go index 08048f33..af2f38e4 100644 --- a/util/util.go +++ b/util/util.go @@ -4,6 +4,7 @@ import ( "bufio" "bytes" "encoding/binary" + "errors" "fmt" "io" @@ -11,6 +12,12 @@ import ( mh "github.com/multiformats/go-multihash" ) +// MaxAllowedSectionSize dictates the maximum number of bytes that a CARv1 header +// or section is allowed to occupy without causing a decode to error. +// This cannot be supplied as an option, only adjusted as a global. You should +// use v2#NewReader instead since it allows for options to be passed in. +var MaxAllowedSectionSize uint = 32 << 20 // 32MiB + var cidv0Pref = []byte{0x12, 0x20} type BytesReader interface { @@ -18,11 +25,15 @@ type BytesReader interface { io.ByteReader } -// TODO: this belongs in the go-cid package +// Deprecated: ReadCid shouldn't be used directly, use CidFromReader from go-cid func ReadCid(buf []byte) (cid.Cid, int, error) { - if bytes.Equal(buf[:2], cidv0Pref) { - c, err := cid.Cast(buf[:34]) - return c, 34, err + if len(buf) >= 2 && bytes.Equal(buf[:2], cidv0Pref) { + i := 34 + if len(buf) < i { + i = len(buf) + } + c, err := cid.Cast(buf[:i]) + return c, i, err } br := bytes.NewReader(buf) @@ -58,7 +69,7 @@ func ReadNode(br *bufio.Reader) (cid.Cid, []byte, error) { return cid.Cid{}, nil, err } - c, n, err := ReadCid(data) + n, c, err := cid.CidFromReader(bytes.NewReader(data)) if err != nil { return cid.Cid{}, nil, err } @@ -112,6 +123,10 @@ func LdRead(r *bufio.Reader) ([]byte, error) { return nil, err } + if l > uint64(MaxAllowedSectionSize) { // Don't OOM + return nil, errors.New("malformed car; header is bigger than util.MaxAllowedSectionSize") + } + buf := make([]byte, l) if _, err := io.ReadFull(r, buf); err != nil { return nil, err diff --git a/v2/bench_test.go b/v2/bench_test.go index 6e766935..2f7dcc63 100644 --- a/v2/bench_test.go +++ b/v2/bench_test.go @@ -108,7 +108,11 @@ func BenchmarkExtractV1UsingReader(b *testing.B) { if err != nil { b.Fatal(err) } - _, err = io.Copy(dst, reader.DataReader()) + dr, err := reader.DataReader() + if err != nil { + b.Fatal(err) + } + _, err = io.Copy(dst, dr) if err != nil { b.Fatal(err) } @@ -119,6 +123,57 @@ func BenchmarkExtractV1UsingReader(b *testing.B) { }) } +// BenchmarkReader_InspectWithBlockValidation benchmarks Reader.Inspect with block hash validation +// for a randomly generated CARv2 file of size 10 MiB. +func BenchmarkReader_InspectWithBlockValidation(b *testing.B) { + path := filepath.Join(b.TempDir(), "bench-large-v2.car") + generateRandomCarV2File(b, path, 10<<20) // 10 MiB + defer os.Remove(path) + + info, err := os.Stat(path) + if err != nil { + b.Fatal(err) + } + b.SetBytes(info.Size()) + b.ReportAllocs() + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + benchmarkInspect(b, path, true) + } + }) +} + +// BenchmarkReader_InspectWithoutBlockValidation benchmarks Reader.Inspect without block hash +// validation for a randomly generated CARv2 file of size 10 MiB. 
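The v1 util package above only exposes this limit as a package-level variable rather than a per-reader option. A minimal sketch of raising it for a caller that genuinely needs larger sections, assuming go-car v0 with this change applied; the 64 MiB value and the drainV1 helper are illustrative:

package carexample

import (
	"bytes"
	"io"

	car "github.com/ipld/go-car"
	"github.com/ipld/go-car/util"
)

// drainV1 decodes every block of a CARv1 payload. Raising the cap is
// process-wide, which is why the comment above recommends the v2 reader and
// its per-reader options for new code.
func drainV1(data []byte) error {
	util.MaxAllowedSectionSize = 64 << 20 // default is 32 MiB
	r, err := car.NewCarReader(bytes.NewReader(data))
	if err != nil {
		return err
	}
	for {
		if _, err := r.Next(); err != nil {
			if err == io.EOF {
				return nil
			}
			return err
		}
	}
}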
+func BenchmarkReader_InspectWithoutBlockValidation(b *testing.B) { + path := filepath.Join(b.TempDir(), "bench-large-v2.car") + generateRandomCarV2File(b, path, 10<<20) // 10 MiB + defer os.Remove(path) + + info, err := os.Stat(path) + if err != nil { + b.Fatal(err) + } + b.SetBytes(info.Size()) + b.ReportAllocs() + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + benchmarkInspect(b, path, false) + } + }) +} + +func benchmarkInspect(b *testing.B, path string, validateBlockHash bool) { + reader, err := carv2.OpenReader(path) + if err != nil { + b.Fatal(err) + } + if _, err := reader.Inspect(validateBlockHash); err != nil { + b.Fatal(err) + } +} func generateRandomCarV2File(b *testing.B, path string, minTotalBlockSize int) { // Use fixed RNG for determinism across benchmarks. rng := rand.New(rand.NewSource(1413)) diff --git a/v2/block_reader.go b/v2/block_reader.go index 456d8d31..252885c3 100644 --- a/v2/block_reader.go +++ b/v2/block_reader.go @@ -30,9 +30,11 @@ type BlockReader struct { // // See BlockReader.Next func NewBlockReader(r io.Reader, opts ...Option) (*BlockReader, error) { + options := ApplyOptions(opts...) + // Read CARv1 header or CARv2 pragma. // Both are a valid CARv1 header, therefore are read as such. - pragmaOrV1Header, err := carv1.ReadHeader(r) + pragmaOrV1Header, err := carv1.ReadHeader(r, options.MaxAllowedHeaderSize) if err != nil { return nil, err } @@ -40,7 +42,7 @@ func NewBlockReader(r io.Reader, opts ...Option) (*BlockReader, error) { // Populate the block reader version and options. br := &BlockReader{ Version: pragmaOrV1Header.Version, - opts: ApplyOptions(opts...), + opts: options, } // Expect either version 1 or 2. @@ -62,20 +64,6 @@ func NewBlockReader(r io.Reader, opts ...Option) (*BlockReader, error) { if _, err := v2h.ReadFrom(r); err != nil { return nil, err } - // Assert the data payload offset validity. - // It must be at least 51 ( + ). - dataOffset := int64(v2h.DataOffset) - if dataOffset < PragmaSize+HeaderSize { - return nil, fmt.Errorf("invalid data payload offset: %v", dataOffset) - } - // Assert the data size validity. - // It must be larger than zero. - // Technically, it should be at least 11 bytes (i.e. a valid CARv1 header with no roots) but - // we let further parsing of the header to signal invalid data payload header. - dataSize := int64(v2h.DataSize) - if dataSize <= 0 { - return nil, fmt.Errorf("invalid data payload size: %v", dataSize) - } // Skip to the beginning of inner CARv1 data payload. // Note, at this point the pragma and CARv1 header have been read. @@ -84,15 +72,15 @@ func NewBlockReader(r io.Reader, opts ...Option) (*BlockReader, error) { // fast forward to the beginning of data payload by subtracting pragma and header size from // dataOffset. rs := internalio.ToByteReadSeeker(r) - if _, err := rs.Seek(dataOffset-PragmaSize-HeaderSize, io.SeekCurrent); err != nil { + if _, err := rs.Seek(int64(v2h.DataOffset)-PragmaSize-HeaderSize, io.SeekCurrent); err != nil { return nil, err } // Set br.r to a LimitReader reading from r limited to dataSize. - br.r = io.LimitReader(r, dataSize) + br.r = io.LimitReader(r, int64(v2h.DataSize)) // Populate br.Roots by reading the inner CARv1 data payload header. 
- header, err := carv1.ReadHeader(br.r) + header, err := carv1.ReadHeader(br.r, options.MaxAllowedHeaderSize) if err != nil { return nil, err } @@ -120,7 +108,7 @@ func NewBlockReader(r io.Reader, opts ...Option) (*BlockReader, error) { // immediately upon encountering a zero-length section without reading any further bytes from the // underlying io.Reader. func (br *BlockReader) Next() (blocks.Block, error) { - c, data, err := util.ReadNode(br.r, br.opts.ZeroLengthSectionAsEOF) + c, data, err := util.ReadNode(br.r, br.opts.ZeroLengthSectionAsEOF, br.opts.MaxAllowedSectionSize) if err != nil { return nil, err } @@ -131,7 +119,7 @@ func (br *BlockReader) Next() (blocks.Block, error) { } if !hashed.Equals(c) { - return nil, fmt.Errorf("mismatch in content integrity, name: %s, data: %s", c, hashed) + return nil, fmt.Errorf("mismatch in content integrity, expected: %s, got: %s", c, hashed) } return blocks.NewBlockWithCid(data, c) diff --git a/v2/block_reader_test.go b/v2/block_reader_test.go index afffc806..384a6ed8 100644 --- a/v2/block_reader_test.go +++ b/v2/block_reader_test.go @@ -1,12 +1,17 @@ package car_test import ( + "bytes" + "encoding/hex" "io" "os" "testing" + "github.com/ipfs/go-cid" carv2 "github.com/ipld/go-car/v2" "github.com/ipld/go-car/v2/internal/carv1" + mh "github.com/multiformats/go-multihash" + "github.com/multiformats/go-varint" "github.com/stretchr/testify/require" ) @@ -104,6 +109,56 @@ func TestBlockReader_WithCarV1Consistency(t *testing.T) { } } +func TestMaxSectionLength(t *testing.T) { + // headerHex is the zero-roots CARv1 header + const headerHex = "11a265726f6f7473806776657273696f6e01" + headerBytes, _ := hex.DecodeString(headerHex) + // 8 MiB block of zeros + block := make([]byte, 8<<20) + // CID for that block + pfx := cid.NewPrefixV1(cid.Raw, mh.SHA2_256) + cid, err := pfx.Sum(block) + require.NoError(t, err) + + // construct CAR + var buf bytes.Buffer + buf.Write(headerBytes) + buf.Write(varint.ToUvarint(uint64(len(cid.Bytes()) + len(block)))) + buf.Write(cid.Bytes()) + buf.Write(block) + + // try to read it + car, err := carv2.NewBlockReader(bytes.NewReader(buf.Bytes())) + require.NoError(t, err) + // error should occur on first section read + _, err = car.Next() + require.EqualError(t, err, "invalid section data, length of read beyond allowable maximum") + + // successful read by expanding the max section size + car, err = carv2.NewBlockReader(bytes.NewReader(buf.Bytes()), carv2.MaxAllowedSectionSize((8<<20)+40)) + require.NoError(t, err) + // can now read block and get our 8 MiB zeroed byte array + readBlock, err := car.Next() + require.NoError(t, err) + require.True(t, bytes.Equal(block, readBlock.RawData())) +} + +func TestMaxHeaderLength(t *testing.T) { + // headerHex is the is a 5 root CARv1 header + const headerHex = "de01a265726f6f747385d82a58250001711220785197229dc8bb1152945da58e2348f7e279eeded06cc2ca736d0e879858b501d82a58250001711220785197229dc8bb1152945da58e2348f7e279eeded06cc2ca736d0e879858b501d82a58250001711220785197229dc8bb1152945da58e2348f7e279eeded06cc2ca736d0e879858b501d82a58250001711220785197229dc8bb1152945da58e2348f7e279eeded06cc2ca736d0e879858b501d82a58250001711220785197229dc8bb1152945da58e2348f7e279eeded06cc2ca736d0e879858b5016776657273696f6e01" + headerBytes, _ := hex.DecodeString(headerHex) + c, _ := cid.Decode("bafyreidykglsfhoixmivffc5uwhcgshx4j465xwqntbmu43nb2dzqwfvae") + + // successful read + car, err := carv2.NewBlockReader(bytes.NewReader(headerBytes)) + require.NoError(t, err) + require.ElementsMatch(t, []cid.Cid{c, c, c, c, 
c}, car.Roots) + + // unsuccessful read, low allowable max header length (length - 3 because there are 2 bytes in the length varint prefix) + _, err = carv2.NewBlockReader(bytes.NewReader(headerBytes), carv2.MaxAllowedHeaderSize(uint64(len(headerBytes)-3))) + require.EqualError(t, err, "invalid header data, length of read beyond allowable maximum") +} + func requireReaderFromPath(t *testing.T, path string) io.Reader { f, err := os.Open(path) require.NoError(t, err) diff --git a/v2/blockstore/insertionindex.go b/v2/blockstore/insertionindex.go index e8575ee1..1e480b3f 100644 --- a/v2/blockstore/insertionindex.go +++ b/v2/blockstore/insertionindex.go @@ -15,6 +15,10 @@ import ( cbor "github.com/whyrusleeping/cbor/go" ) +// This index is intended to be efficient for random-access, in-memory lookups +// and is not intended to be an index type that is attached to a CARv2. +// See flatten() for conversion of this data to a known, existing index type. + var ( errUnsupported = errors.New("not supported") insertionIndexCodec = multicodec.Code(0x300003) @@ -137,6 +141,20 @@ func (ii *insertionIndex) Unmarshal(r io.Reader) error { return nil } +func (ii *insertionIndex) ForEach(f func(multihash.Multihash, uint64) error) error { + var errr error + ii.items.AscendGreaterOrEqual(ii.items.Min(), func(i llrb.Item) bool { + r := i.(recordDigest).Record + err := f(r.Cid.Hash(), r.Offset) + if err != nil { + errr = err + return false + } + return true + }) + return errr +} + func (ii *insertionIndex) Codec() multicodec.Code { return insertionIndexCodec } diff --git a/v2/blockstore/readonly.go b/v2/blockstore/readonly.go index f0a15e78..32c49046 100644 --- a/v2/blockstore/readonly.go +++ b/v2/blockstore/readonly.go @@ -47,6 +47,7 @@ type ReadOnly struct { // The backing containing the data payload in CARv1 format. backing io.ReaderAt + // The CARv1 content index. idx index.Index @@ -98,7 +99,7 @@ func NewReadOnly(backing io.ReaderAt, idx index.Index, opts ...carv2.Option) (*R opts: carv2.ApplyOptions(opts...), } - version, err := readVersion(backing) + version, err := readVersion(backing, opts...) if err != nil { return nil, err } @@ -119,15 +120,28 @@ func NewReadOnly(backing io.ReaderAt, idx index.Index, opts ...carv2.Option) (*R } if idx == nil { if v2r.Header.HasIndex() { - idx, err = index.ReadFrom(v2r.IndexReader()) + ir, err := v2r.IndexReader() if err != nil { return nil, err } - } else if idx, err = generateIndex(v2r.DataReader(), opts...); err != nil { - return nil, err + idx, err = index.ReadFrom(ir) + if err != nil { + return nil, err + } + } else { + dr, err := v2r.DataReader() + if err != nil { + return nil, err + } + if idx, err = generateIndex(dr, opts...); err != nil { + return nil, err + } } } - b.backing = v2r.DataReader() + b.backing, err = v2r.DataReader() + if err != nil { + return nil, err + } b.idx = idx return b, nil default: @@ -135,15 +149,19 @@ func NewReadOnly(backing io.ReaderAt, idx index.Index, opts ...carv2.Option) (*R } } -func readVersion(at io.ReaderAt) (uint64, error) { +func readVersion(at io.ReaderAt, opts ...carv2.Option) (uint64, error) { var rr io.Reader switch r := at.(type) { case io.Reader: rr = r default: - rr = internalio.NewOffsetReadSeeker(r, 0) + var err error + rr, err = internalio.NewOffsetReadSeeker(r, 0) + if err != nil { + return 0, err + } } - return carv2.ReadVersion(rr) + return carv2.ReadVersion(rr, opts...) 
} func generateIndex(at io.ReaderAt, opts ...carv2.Option) (index.Index, error) { @@ -156,7 +174,11 @@ func generateIndex(at io.ReaderAt, opts ...carv2.Option) (index.Index, error) { return nil, err } default: - rs = internalio.NewOffsetReadSeeker(r, 0) + var err error + rs, err = internalio.NewOffsetReadSeeker(r, 0) + if err != nil { + return nil, err + } } // Note, we do not set any write options so that all write options fall back onto defaults. @@ -182,8 +204,11 @@ func OpenReadOnly(path string, opts ...carv2.Option) (*ReadOnly, error) { } func (b *ReadOnly) readBlock(idx int64) (cid.Cid, []byte, error) { - bcid, data, err := util.ReadNode(internalio.NewOffsetReadSeeker(b.backing, idx), b.opts.ZeroLengthSectionAsEOF) - return bcid, data, err + r, err := internalio.NewOffsetReadSeeker(b.backing, idx) + if err != nil { + return cid.Cid{}, nil, err + } + return util.ReadNode(r, b.opts.ZeroLengthSectionAsEOF, b.opts.MaxAllowedSectionSize) } // DeleteBlock is unsupported and always errors. @@ -212,8 +237,11 @@ func (b *ReadOnly) Has(ctx context.Context, key cid.Cid) (bool, error) { var fnFound bool var fnErr error err := b.idx.GetAll(key, func(offset uint64) bool { - uar := internalio.NewOffsetReadSeeker(b.backing, int64(offset)) - var err error + uar, err := internalio.NewOffsetReadSeeker(b.backing, int64(offset)) + if err != nil { + fnErr = err + return false + } _, err = varint.ReadUvarint(uar) if err != nil { fnErr = err @@ -313,7 +341,11 @@ func (b *ReadOnly) GetSize(ctx context.Context, key cid.Cid) (int, error) { fnSize := -1 var fnErr error err := b.idx.GetAll(key, func(offset uint64) bool { - rdr := internalio.NewOffsetReadSeeker(b.backing, int64(offset)) + rdr, err := internalio.NewOffsetReadSeeker(b.backing, int64(offset)) + if err != nil { + fnErr = err + return false + } sectionLen, err := varint.ReadUvarint(rdr) if err != nil { fnErr = err @@ -397,8 +429,11 @@ func (b *ReadOnly) AllKeysChan(ctx context.Context) (<-chan cid.Cid, error) { } // TODO we may use this walk for populating the index, and we need to be able to iterate keys in this way somewhere for index generation. In general though, when it's asked for all keys from a blockstore with an index, we should iterate through the index when possible rather than linear reads through the full car. - rdr := internalio.NewOffsetReadSeeker(b.backing, 0) - header, err := carv1.ReadHeader(rdr) + rdr, err := internalio.NewOffsetReadSeeker(b.backing, 0) + if err != nil { + return nil, err + } + header, err := carv1.ReadHeader(rdr, b.opts.MaxAllowedHeaderSize) if err != nil { b.mu.RUnlock() // don't hold the mutex forever return nil, fmt.Errorf("error reading car header: %w", err) @@ -441,7 +476,11 @@ func (b *ReadOnly) AllKeysChan(ctx context.Context) (<-chan cid.Cid, error) { } } - thisItemForNxt := rdr.Offset() + thisItemForNxt, err := rdr.Seek(0, io.SeekCurrent) + if err != nil { + maybeReportError(ctx, err) + return + } _, c, err := cid.CidFromReader(rdr) if err != nil { maybeReportError(ctx, err) @@ -484,7 +523,11 @@ func (b *ReadOnly) HashOnRead(bool) { // Roots returns the root CIDs of the backing CAR. 
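Because ReadOnly stores the applied options, the same limits guard later block reads made through the blockstore too. A minimal sketch, assuming the v2 blockstore package with these options; openUntrustedCar and the 1 MiB / 16 MiB caps are illustrative:

package carexample

import (
	carv2 "github.com/ipld/go-car/v2"
	"github.com/ipld/go-car/v2/blockstore"
)

// openUntrustedCar opens a CAR file as a read-only blockstore while bounding
// how large a header or section the underlying readers will accept.
func openUntrustedCar(path string) (*blockstore.ReadOnly, error) {
	return blockstore.OpenReadOnly(path,
		carv2.MaxAllowedHeaderSize(1<<20),   // cap on the CARv1/CARv2 header
		carv2.MaxAllowedSectionSize(16<<20), // cap on each length-prefixed section
	)
}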
func (b *ReadOnly) Roots() ([]cid.Cid, error) { - header, err := carv1.ReadHeader(internalio.NewOffsetReadSeeker(b.backing, 0)) + ors, err := internalio.NewOffsetReadSeeker(b.backing, 0) + if err != nil { + return nil, err + } + header, err := carv1.ReadHeader(ors, b.opts.MaxAllowedHeaderSize) if err != nil { return nil, fmt.Errorf("error reading car header: %w", err) } diff --git a/v2/blockstore/readonly_test.go b/v2/blockstore/readonly_test.go index f55a1e50..5a947a75 100644 --- a/v2/blockstore/readonly_test.go +++ b/v2/blockstore/readonly_test.go @@ -226,7 +226,9 @@ func newV1ReaderFromV2File(t *testing.T, carv2Path string, zeroLenSectionAsEOF b t.Cleanup(func() { f.Close() }) v2r, err := carv2.NewReader(f) require.NoError(t, err) - v1r, err := newV1Reader(v2r.DataReader(), zeroLenSectionAsEOF) + dr, err := v2r.DataReader() + require.NoError(t, err) + v1r, err := newV1Reader(dr, zeroLenSectionAsEOF) require.NoError(t, err) return v1r } diff --git a/v2/blockstore/readwrite.go b/v2/blockstore/readwrite.go index 090633c0..774f37ff 100644 --- a/v2/blockstore/readwrite.go +++ b/v2/blockstore/readwrite.go @@ -86,12 +86,12 @@ func AllowDuplicatePuts(allow bool) carv2.Option { // successfully. On resumption the roots argument and WithDataPadding option must match the // previous instantiation of ReadWrite blockstore that created the file. More explicitly, the file // resuming from must: -// 1. start with a complete CARv2 car.Pragma. -// 2. contain a complete CARv1 data header with root CIDs matching the CIDs passed to the -// constructor, starting at offset optionally padded by WithDataPadding, followed by zero or -// more complete data sections. If any corrupt data sections are present the resumption will fail. -// Note, if set previously, the blockstore must use the same WithDataPadding option as before, -// since this option is used to locate the CARv1 data payload. +// 1. start with a complete CARv2 car.Pragma. +// 2. contain a complete CARv1 data header with root CIDs matching the CIDs passed to the +// constructor, starting at offset optionally padded by WithDataPadding, followed by zero or +// more complete data sections. If any corrupt data sections are present the resumption will fail. +// Note, if set previously, the blockstore must use the same WithDataPadding option as before, +// since this option is used to locate the CARv1 data payload. // // Note, resumption should be used with WithCidDeduplication, so that blocks that are successfully // written into the file are not re-written. Unless, the user explicitly wants duplicate blocks. @@ -139,7 +139,10 @@ func OpenReadWrite(path string, roots []cid.Cid, opts ...carv2.Option) (*ReadWri offset = 0 } rwbs.dataWriter = internalio.NewOffsetWriter(rwbs.f, offset) - v1r := internalio.NewOffsetReadSeeker(rwbs.f, offset) + v1r, err := internalio.NewOffsetReadSeeker(rwbs.f, offset) + if err != nil { + return nil, err + } rwbs.ronly.backing = v1r rwbs.ronly.idx = rwbs.idx rwbs.ronly.carv2Closer = rwbs.f @@ -190,7 +193,11 @@ func (b *ReadWrite) resumeWithRoots(v2 bool, roots []cid.Cid) error { // Check if file was finalized by trying to read the CARv2 header. // We check because if finalized the CARv1 reader behaviour needs to be adjusted since // EOF will not signify end of CARv1 payload. i.e. index is most likely present. 
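The resumption rules documented above amount to reopening the same path with the same roots and padding-related options. A minimal sketch, assuming the v2 blockstore API in this file; resumeSession is an illustrative helper:

package carexample

import (
	"github.com/ipfs/go-cid"
	"github.com/ipld/go-car/v2/blockstore"
)

// resumeSession reopens a partially written CARv2 so further writes continue
// where the previous process stopped. The roots (and any data-padding option
// used originally) must match the first session, per the documentation above.
func resumeSession(path string, roots []cid.Cid) (*blockstore.ReadWrite, error) {
	return blockstore.OpenReadWrite(path, roots)
}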
- _, err = headerInFile.ReadFrom(internalio.NewOffsetReadSeeker(b.f, carv2.PragmaSize)) + r, err := internalio.NewOffsetReadSeeker(b.f, carv2.PragmaSize) + if err != nil { + return err + } + _, err = headerInFile.ReadFrom(r) // If reading CARv2 header succeeded, and CARv1 offset in header is not zero then the file is // most-likely finalized. Check padding and truncate the file to remove index. @@ -216,8 +223,11 @@ func (b *ReadWrite) resumeWithRoots(v2 bool, roots []cid.Cid) error { } // Use the given CARv1 padding to instantiate the CARv1 reader on file. - v1r := internalio.NewOffsetReadSeeker(b.ronly.backing, 0) - header, err := carv1.ReadHeader(v1r) + v1r, err := internalio.NewOffsetReadSeeker(b.ronly.backing, 0) + if err != nil { + return err + } + header, err := carv1.ReadHeader(v1r, b.opts.MaxAllowedHeaderSize) if err != nil { // Cannot read the CARv1 header; the file is most likely corrupt. return fmt.Errorf("error reading car header: %w", err) diff --git a/v2/blockstore/readwrite_test.go b/v2/blockstore/readwrite_test.go index 86ab06ea..11cc99a2 100644 --- a/v2/blockstore/readwrite_test.go +++ b/v2/blockstore/readwrite_test.go @@ -487,7 +487,9 @@ func TestBlockstoreResumption(t *testing.T) { wantPayloadReader, err := carv1.NewCarReader(v1f) require.NoError(t, err) - gotPayloadReader, err := carv1.NewCarReader(v2r.DataReader()) + dr, err := v2r.DataReader() + require.NoError(t, err) + gotPayloadReader, err := carv1.NewCarReader(dr) require.NoError(t, err) require.Equal(t, wantPayloadReader.Header, gotPayloadReader.Header) @@ -515,9 +517,13 @@ func TestBlockstoreResumption(t *testing.T) { // Assert index in resumed from file is identical to index generated from the data payload portion of the generated CARv2 file. _, err = v1f.Seek(0, io.SeekStart) require.NoError(t, err) - gotIdx, err := index.ReadFrom(v2r.IndexReader()) + ir, err := v2r.IndexReader() + require.NoError(t, err) + gotIdx, err := index.ReadFrom(ir) + require.NoError(t, err) + dr, err = v2r.DataReader() require.NoError(t, err) - wantIdx, err := carv2.GenerateIndex(v2r.DataReader()) + wantIdx, err := carv2.GenerateIndex(dr) require.NoError(t, err) require.Equal(t, wantIdx, gotIdx) } @@ -828,14 +834,17 @@ func TestOpenReadWrite_WritesIdentityCIDsWhenOptionIsEnabled(t *testing.T) { t.Cleanup(func() { require.NoError(t, r.Close()) }) require.True(t, r.Header.HasIndex()) - ir := r.IndexReader() + ir, err := r.IndexReader() + require.NoError(t, err) require.NotNil(t, ir) gotIdx, err := index.ReadFrom(ir) require.NoError(t, err) // Determine expected offset as the length of header plus one - header, err := carv1.ReadHeader(r.DataReader()) + dr, err := r.DataReader() + require.NoError(t, err) + header, err := carv1.ReadHeader(dr, carv1.DefaultMaxAllowedHeaderSize) require.NoError(t, err) object, err := cbor.DumpObject(header) require.NoError(t, err) @@ -918,7 +927,9 @@ func TestReadWrite_ReWritingCARv1WithIdentityCidIsIdenticalToOriginalWithOptions // Note, we hash instead of comparing bytes to avoid excessive memory usage when sample CARv1 is large. 
hasher := sha512.New() - gotWritten, err := io.Copy(hasher, v2r.DataReader()) + dr, err := v2r.DataReader() + require.NoError(t, err) + gotWritten, err := io.Copy(hasher, dr) require.NoError(t, err) gotSum := hasher.Sum(nil) diff --git a/v2/car.go b/v2/car.go index f2885d9d..571eb114 100644 --- a/v2/car.go +++ b/v2/car.go @@ -2,6 +2,7 @@ package car import ( "encoding/binary" + "fmt" "io" ) @@ -166,8 +167,27 @@ func (h *Header) ReadFrom(r io.Reader) (int64, error) { if err != nil { return n, err } - h.DataOffset = binary.LittleEndian.Uint64(buf[:8]) - h.DataSize = binary.LittleEndian.Uint64(buf[8:16]) - h.IndexOffset = binary.LittleEndian.Uint64(buf[16:]) + dataOffset := binary.LittleEndian.Uint64(buf[:8]) + dataSize := binary.LittleEndian.Uint64(buf[8:16]) + indexOffset := binary.LittleEndian.Uint64(buf[16:]) + // Assert the data payload offset validity. + // It must be at least 51 ( + ). + if int64(dataOffset) < PragmaSize+HeaderSize { + return n, fmt.Errorf("invalid data payload offset: %v", dataOffset) + } + // Assert the data size validity. + // It must be larger than zero. + // Technically, it should be at least 11 bytes (i.e. a valid CARv1 header with no roots) but + // we let further parsing of the header to signal invalid data payload header. + if int64(dataSize) <= 0 { + return n, fmt.Errorf("invalid data payload size: %v", dataSize) + } + // Assert the index offset validity. + if int64(indexOffset) < 0 { + return n, fmt.Errorf("invalid index offset: %v", indexOffset) + } + h.DataOffset = dataOffset + h.DataSize = dataSize + h.IndexOffset = indexOffset return n, nil } diff --git a/v2/car_test.go b/v2/car_test.go index 64519b29..d993a374 100644 --- a/v2/car_test.go +++ b/v2/car_test.go @@ -37,7 +37,7 @@ func TestCarV2PragmaLength(t *testing.T) { } func TestCarV2PragmaIsValidCarV1Header(t *testing.T) { - v1h, err := carv1.ReadHeader(bytes.NewReader(carv2.Pragma)) + v1h, err := carv1.ReadHeader(bytes.NewReader(carv2.Pragma), carv1.DefaultMaxAllowedHeaderSize) assert.NoError(t, err, "cannot decode pragma as CBOR with CARv1 header structure") assert.Equal(t, &carv1.CarHeader{ Roots: nil, @@ -56,11 +56,12 @@ func TestHeader_WriteTo(t *testing.T) { "HeaderWithEmptyCharacteristicsIsWrittenAsExpected", carv2.Header{ Characteristics: carv2.Characteristics{}, + DataOffset: 99, }, []byte{ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x63, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, }, @@ -114,12 +115,14 @@ func TestHeader_ReadFrom(t *testing.T) { []byte{ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x63, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x64, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, }, carv2.Header{ Characteristics: carv2.Characteristics{}, + DataOffset: 99, + DataSize: 100, }, false, }, diff --git a/v2/example_test.go b/v2/example_test.go index 53dfa349..57378aea 100644 --- a/v2/example_test.go +++ b/v2/example_test.go @@ -51,7 +51,11 @@ func ExampleWrapV1File() { if err != nil { panic(err) } - inner, err := ioutil.ReadAll(cr.DataReader()) + dr, err := cr.DataReader() + if err != nil { + panic(err) + } + inner, err := ioutil.ReadAll(dr) if err != nil { panic(err) } diff --git a/v2/fuzz_test.go b/v2/fuzz_test.go new file mode 100644 index 00000000..c45d83cb --- /dev/null +++ 
b/v2/fuzz_test.go @@ -0,0 +1,142 @@ +//go:build go1.18 +// +build go1.18 + +package car_test + +import ( + "bytes" + "encoding/hex" + "io" + "os" + "path/filepath" + "testing" + + car "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/internal/carv1" +) + +// v1FixtureStr is a clean carv1 single-block, single-root CAR +const v1FixtureStr = "3aa265726f6f747381d82a58250001711220151fe9e73c6267a7060c6f6c4cca943c236f4b196723489608edb42a8b8fa80b6776657273696f6e012c01711220151fe9e73c6267a7060c6f6c4cca943c236f4b196723489608edb42a8b8fa80ba165646f646779f5" + +func seedWithCarFiles(f *testing.F) { + fixture, err := hex.DecodeString(v1FixtureStr) + if err != nil { + f.Fatal(err) + } + f.Add(fixture) + files, err := filepath.Glob("testdata/*.car") + if err != nil { + f.Fatal(err) + } + for _, fname := range files { + func() { + file, err := os.Open(fname) + if err != nil { + f.Fatal(err) + } + defer file.Close() + data, err := io.ReadAll(file) + if err != nil { + f.Fatal(err) + } + f.Add(data) + }() + } +} + +func FuzzBlockReader(f *testing.F) { + seedWithCarFiles(f) + + f.Fuzz(func(t *testing.T, data []byte) { + r, err := car.NewBlockReader(bytes.NewReader(data)) + if err != nil { + return + } + + for { + _, err = r.Next() + if err == io.EOF { + return + } + } + }) +} + +func FuzzReader(f *testing.F) { + seedWithCarFiles(f) + + f.Fuzz(func(t *testing.T, data []byte) { + subject, err := car.NewReader(bytes.NewReader(data)) + if err != nil { + return + } + + subject.Roots() + _, err = subject.IndexReader() + if err != nil { + return + } + dr, err := subject.DataReader() + if err != nil { + return + } + car.GenerateIndex(dr) + }) +} + +func FuzzInspect(f *testing.F) { + seedWithCarFiles(f) + + f.Fuzz(func(t *testing.T, data []byte) { + reader, err := car.NewReader(bytes.NewReader(data)) + if err != nil { + return + } + + // Do differential fuzzing between Inspect and the normal parser + _, inspectErr := reader.Inspect(true) + if inspectErr == nil { + return + } + + reader, err = car.NewReader(bytes.NewReader(data)) + if err != nil { + t.Fatal("second NewReader on same data failed", err.Error()) + } + i, err := reader.IndexReader() + if err != nil { + return + } + // FIXME: Once indexes are safe to parse, do not skip .car with index in the differential fuzzing. + if i == nil { + return + } + dr, err := reader.DataReader() + if err != nil { + return + } + + _, err = carv1.ReadHeader(dr, carv1.DefaultMaxAllowedHeaderSize) + if err != nil { + return + } + + blocks, err := car.NewBlockReader(dr) + if err != nil { + return + } + + for { + _, err := blocks.Next() + if err != nil { + if err == io.EOF { + break + } + // caught error as expected + return + } + } + + t.Fatal("Inspect found error but we red this file correctly:", inspectErr.Error()) + }) +} diff --git a/v2/index/example_test.go b/v2/index/example_test.go index 3e484afb..d2a9da54 100644 --- a/v2/index/example_test.go +++ b/v2/index/example_test.go @@ -28,7 +28,11 @@ func ExampleReadFrom() { } // Read and unmarshall index within CARv2 file. - idx, err := index.ReadFrom(cr.IndexReader()) + ir, err := cr.IndexReader() + if err != nil { + panic(err) + } + idx, err := index.ReadFrom(ir) if err != nil { panic(err) } @@ -62,7 +66,11 @@ func ExampleWriteTo() { }() // Read and unmarshall index within CARv2 file. 
- idx, err := index.ReadFrom(cr.IndexReader()) + ir, err := cr.IndexReader() + if err != nil { + panic(err) + } + idx, err := index.ReadFrom(ir) if err != nil { panic(err) } diff --git a/v2/index/index.go b/v2/index/index.go index 10195b43..911eaa7a 100644 --- a/v2/index/index.go +++ b/v2/index/index.go @@ -5,14 +5,12 @@ import ( "fmt" "io" + "github.com/ipfs/go-cid" internalio "github.com/ipld/go-car/v2/internal/io" "github.com/multiformats/go-multicodec" "github.com/multiformats/go-multihash" - "github.com/multiformats/go-varint" - - "github.com/ipfs/go-cid" ) // CarIndexNone is a sentinal value used as a multicodec code for the index indicating no index. @@ -44,7 +42,12 @@ type ( // Marshal encodes the index in serial form. Marshal(w io.Writer) (uint64, error) + // Unmarshal decodes the index from its serial form. + // Note, this function will copy the entire index into memory. + // + // Do not unmarshal index from untrusted CARv2 files. Instead the index should be + // regenerated from the CARv2 data payload. Unmarshal(r io.Reader) error // Load inserts a number of records into the index. @@ -68,14 +71,7 @@ type ( GetAll(cid.Cid, func(uint64) bool) error } - // IterableIndex extends Index in cases where the Index is able to - // provide an iterator for getting the list of all multihashes in the - // index. - // - // Note that it is possible for an index to contain multiple offsets for - // a given multihash. - // - // See: IterableIndex.ForEach, Index.GetAll. + // IterableIndex is an index which support iterating over it's elements IterableIndex interface { Index @@ -136,12 +132,14 @@ func WriteTo(idx Index, w io.Writer) (uint64, error) { // ReadFrom reads index from r. // The reader decodes the index by reading the first byte to interpret the encoding. // Returns error if the encoding is not known. +// +// Attempting to read index data from untrusted sources is not recommended. +// Instead the index should be regenerated from the CARv2 data payload. func ReadFrom(r io.Reader) (Index, error) { - code, err := varint.ReadUvarint(internalio.ToByteReader(r)) + codec, err := ReadCodec(r) if err != nil { return nil, err } - codec := multicodec.Code(code) idx, err := New(codec) if err != nil { return nil, err @@ -151,3 +149,12 @@ func ReadFrom(r io.Reader) (Index, error) { } return idx, nil } + +// ReadCodec reads the codec of the index by decoding the first varint read from r. 
+func ReadCodec(r io.Reader) (multicodec.Code, error) { + code, err := varint.ReadUvarint(internalio.ToByteReader(r)) + if err != nil { + return 0, err + } + return multicodec.Code(code), nil +} diff --git a/v2/index/index_test.go b/v2/index/index_test.go index 267beb03..972b4c66 100644 --- a/v2/index/index_test.go +++ b/v2/index/index_test.go @@ -54,6 +54,16 @@ func TestReadFrom(t *testing.T) { subject, err := ReadFrom(idxf) require.NoError(t, err) + _, err = idxf.Seek(0, io.SeekStart) + require.NoError(t, err) + + idxf2, err := os.Open("../testdata/sample-multihash-index-sorted.carindex") + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, idxf2.Close()) }) + + subjectInAltFormat, err := ReadFrom(idxf) + require.NoError(t, err) + crf, err := os.Open("../testdata/sample-v1.car") require.NoError(t, err) t.Cleanup(func() { require.NoError(t, crf.Close()) }) @@ -67,17 +77,23 @@ func TestReadFrom(t *testing.T) { } require.NoError(t, err) + wantCid := wantBlock.Cid() // Get offset from the index for a CID and assert it exists - gotOffset, err := GetFirst(subject, wantBlock.Cid()) + gotOffset, err := GetFirst(subject, wantCid) require.NoError(t, err) require.NotZero(t, gotOffset) + // Get offset from the index in alternative format for a CID and assert it exists + gotOffset2, err := GetFirst(subjectInAltFormat, wantCid) + require.NoError(t, err) + require.NotZero(t, gotOffset2) + // Seek to the offset on CARv1 file _, err = crf.Seek(int64(gotOffset), io.SeekStart) require.NoError(t, err) // Read the fame at offset and assert the frame corresponds to the expected block. - gotCid, gotData, err := util.ReadNode(crf, false) + gotCid, gotData, err := util.ReadNode(crf, false, carv1.DefaultMaxAllowedSectionSize) require.NoError(t, err) gotBlock, err := blocks.NewBlockWithCid(gotData, gotCid) require.NoError(t, err) @@ -87,7 +103,7 @@ func TestReadFrom(t *testing.T) { func TestWriteTo(t *testing.T) { // Read sample index on file - idxf, err := os.Open("../testdata/sample-index.carindex") + idxf, err := os.Open("../testdata/sample-multihash-index-sorted.carindex") require.NoError(t, err) t.Cleanup(func() { require.NoError(t, idxf.Close()) }) @@ -116,18 +132,37 @@ func TestWriteTo(t *testing.T) { } func TestMarshalledIndexStartsWithCodec(t *testing.T) { - // Read sample index on file - idxf, err := os.Open("../testdata/sample-index.carindex") - require.NoError(t, err) - t.Cleanup(func() { require.NoError(t, idxf.Close()) }) - - // Unmarshall to get expected index - wantIdx, err := ReadFrom(idxf) - require.NoError(t, err) - // Assert the first two bytes are the corresponding multicodec code. - buf := new(bytes.Buffer) - _, err = WriteTo(wantIdx, buf) - require.NoError(t, err) - require.Equal(t, varint.ToUvarint(uint64(multicodec.CarIndexSorted)), buf.Bytes()[:2]) + tests := []struct { + path string + codec multicodec.Code + }{ + { + path: "../testdata/sample-multihash-index-sorted.carindex", + codec: multicodec.CarMultihashIndexSorted, + }, + { + path: "../testdata/sample-index.carindex", + codec: multicodec.CarIndexSorted, + }, + } + for _, test := range tests { + test := test + t.Run(test.codec.String(), func(t *testing.T) { + // Read sample index on file + idxf, err := os.Open(test.path) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, idxf.Close()) }) + + // Unmarshall to get expected index + wantIdx, err := ReadFrom(idxf) + require.NoError(t, err) + + // Assert the first two bytes are the corresponding multicodec code. 
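A minimal sketch of what the new ReadCodec helper enables, assuming go-car v2 with this change; embeddedIndexCodec is illustrative and reports the embedded index format without unmarshaling the index itself, which the comments above discourage for untrusted files:

package carexample

import (
	carv2 "github.com/ipld/go-car/v2"
	"github.com/ipld/go-car/v2/index"
	"github.com/multiformats/go-multicodec"
)

// embeddedIndexCodec returns the multicodec of the index embedded in a CARv2
// file, or CarIndexNone when the file carries no index.
func embeddedIndexCodec(path string) (multicodec.Code, error) {
	r, err := carv2.OpenReader(path)
	if err != nil {
		return 0, err
	}
	defer r.Close()

	if !r.Header.HasIndex() {
		return index.CarIndexNone, nil
	}
	ir, err := r.IndexReader()
	if err != nil {
		return 0, err
	}
	return index.ReadCodec(ir)
}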
+ buf := new(bytes.Buffer) + _, err = WriteTo(wantIdx, buf) + require.NoError(t, err) + require.Equal(t, varint.ToUvarint(uint64(test.codec)), buf.Bytes()[:2]) + }) + } } diff --git a/v2/index/indexsorted.go b/v2/index/indexsorted.go index 2c05a922..aeed4c11 100644 --- a/v2/index/indexsorted.go +++ b/v2/index/indexsorted.go @@ -3,7 +3,9 @@ package index import ( "bytes" "encoding/binary" + "errors" "fmt" + internalio "github.com/ipld/go-car/v2/internal/io" "io" "sort" @@ -61,16 +63,51 @@ func (s *singleWidthIndex) Marshal(w io.Writer) (uint64, error) { } func (s *singleWidthIndex) Unmarshal(r io.Reader) error { - if err := binary.Read(r, binary.LittleEndian, &s.width); err != nil { + var width uint32 + if err := binary.Read(r, binary.LittleEndian, &width); err != nil { + if err == io.EOF { + return io.ErrUnexpectedEOF + } return err } - if err := binary.Read(r, binary.LittleEndian, &s.len); err != nil { + var dataLen uint64 + if err := binary.Read(r, binary.LittleEndian, &dataLen); err != nil { + if err == io.EOF { + return io.ErrUnexpectedEOF + } return err } - s.index = make([]byte, s.len) - s.len /= uint64(s.width) - _, err := io.ReadFull(r, s.index) - return err + + if err := s.checkUnmarshalLengths(width, dataLen, 0); err != nil { + return err + } + + buf := make([]byte, dataLen) + if _, err := io.ReadFull(r, buf); err != nil { + return err + } + s.index = buf + return nil +} + +func (s *singleWidthIndex) checkUnmarshalLengths(width uint32, dataLen, extra uint64) error { + if width <= 8 { + return errors.New("malformed index; width must be bigger than 8") + } + const maxWidth = 32 << 20 // 32MiB, to ~match the go-cid maximum + if width > maxWidth { + return errors.New("index too big; singleWidthIndex width is larger than allowed maximum") + } + oldDataLen, dataLen := dataLen, dataLen+extra + if oldDataLen > dataLen { + return errors.New("index too big; singleWidthIndex len is overflowing") + } + if int64(dataLen) < 0 { + return errors.New("index too big; singleWidthIndex len is overflowing int64") + } + s.width = width + s.len = dataLen / uint64(width) + return nil } func (s *singleWidthIndex) Less(i int, digest []byte) bool { @@ -190,15 +227,35 @@ func (m *multiWidthIndex) Marshal(w io.Writer) (uint64, error) { } func (m *multiWidthIndex) Unmarshal(r io.Reader) error { + reader := internalio.ToByteReadSeeker(r) var l int32 - if err := binary.Read(r, binary.LittleEndian, &l); err != nil { + if err := binary.Read(reader, binary.LittleEndian, &l); err != nil { + if err == io.EOF { + return io.ErrUnexpectedEOF + } + return err + } + sum, err := reader.Seek(0, io.SeekCurrent) + if err != nil { return err } + if int32(l) < 0 { + return errors.New("index too big; multiWidthIndex count is overflowing int32") + } for i := 0; i < int(l); i++ { s := singleWidthIndex{} if err := s.Unmarshal(r); err != nil { return err } + n, err := reader.Seek(0, io.SeekCurrent) + if err != nil { + return err + } + oldSum := sum + sum += n + if sum < oldSum { + return errors.New("index too big; multiWidthIndex len is overflowing int64") + } (*m)[s.width] = s } return nil diff --git a/v2/index/mhindexsorted.go b/v2/index/mhindexsorted.go index 55975b8e..e0ef675d 100644 --- a/v2/index/mhindexsorted.go +++ b/v2/index/mhindexsorted.go @@ -2,6 +2,8 @@ package index import ( "encoding/binary" + "errors" + internalio "github.com/ipld/go-car/v2/internal/io" "io" "sort" @@ -41,6 +43,9 @@ func (m *multiWidthCodedIndex) Marshal(w io.Writer) (uint64, error) { func (m *multiWidthCodedIndex) Unmarshal(r io.Reader) error { if 
err := binary.Read(r, binary.LittleEndian, &m.code); err != nil { + if err == io.EOF { + return io.ErrUnexpectedEOF + } return err } return m.multiWidthIndex.Unmarshal(r) @@ -93,15 +98,35 @@ func (m *MultihashIndexSorted) sortedMultihashCodes() []uint64 { } func (m *MultihashIndexSorted) Unmarshal(r io.Reader) error { + reader := internalio.ToByteReadSeeker(r) var l int32 - if err := binary.Read(r, binary.LittleEndian, &l); err != nil { + if err := binary.Read(reader, binary.LittleEndian, &l); err != nil { + if err == io.EOF { + return io.ErrUnexpectedEOF + } + return err + } + sum, err := reader.Seek(0, io.SeekCurrent) + if err != nil { return err } + if int32(l) < 0 { + return errors.New("index too big; MultihashIndexSorted count is overflowing int32") + } for i := 0; i < int(l); i++ { mwci := newMultiWidthCodedIndex() if err := mwci.Unmarshal(r); err != nil { return err } + n, err := reader.Seek(0, io.SeekCurrent) + if err != nil { + return err + } + oldSum := sum + sum += n + if sum < oldSum { + return errors.New("index too big; MultihashIndexSorted len is overflowing int64") + } m.put(mwci) } return nil diff --git a/v2/index/mhindexsorted_test.go b/v2/index/mhindexsorted_test.go index 79fc9c5f..520157e4 100644 --- a/v2/index/mhindexsorted_test.go +++ b/v2/index/mhindexsorted_test.go @@ -53,21 +53,23 @@ func TestMultiWidthCodedIndex_StableIterate(t *testing.T) { records = append(records, generateIndexRecords(t, multihash.IDENTITY, rng)...) // Create a new mh sorted index and load randomly generated records into it. - subject, err := index.New(multicodec.CarMultihashIndexSorted) + idx, err := index.New(multicodec.CarMultihashIndexSorted) require.NoError(t, err) - err = subject.Load(records) + err = idx.Load(records) require.NoError(t, err) - iterable := subject.(index.IterableIndex) + subject, ok := idx.(index.IterableIndex) + require.True(t, ok) + mh := make([]multihash.Multihash, 0, len(records)) - require.NoError(t, iterable.ForEach(func(m multihash.Multihash, _ uint64) error { + require.NoError(t, subject.ForEach(func(m multihash.Multihash, _ uint64) error { mh = append(mh, m) return nil })) for i := 0; i < 10; i++ { candidate := make([]multihash.Multihash, 0, len(records)) - require.NoError(t, iterable.ForEach(func(m multihash.Multihash, _ uint64) error { + require.NoError(t, subject.ForEach(func(m multihash.Multihash, _ uint64) error { candidate = append(candidate, m) return nil })) diff --git a/v2/index_gen.go b/v2/index_gen.go index 10c86dc3..b0b87453 100644 --- a/v2/index_gen.go +++ b/v2/index_gen.go @@ -37,8 +37,11 @@ func GenerateIndex(v1r io.Reader, opts ...Option) (index.Index, error) { // Note, the index is re-generated every time even if r is in CARv2 format and already has an index. // To read existing index when available see ReadOrGenerateIndex. func LoadIndex(idx index.Index, r io.Reader, opts ...Option) error { + // Parse Options. + o := ApplyOptions(opts...) + reader := internalio.ToByteReadSeeker(r) - pragma, err := carv1.ReadHeader(r) + pragma, err := carv1.ReadHeader(r, o.MaxAllowedHeaderSize) if err != nil { return fmt.Errorf("error reading car header: %w", err) } @@ -78,7 +81,7 @@ func LoadIndex(idx index.Index, r io.Reader, opts ...Option) error { dataOffset = int64(v2h.DataOffset) // Read the inner CARv1 header to skip it and sanity check it. 
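Since LoadIndex now applies the parsed options, the header cap stays in force while indexing as well. A minimal sketch, assuming the v2 API shown here; indexPayload, the multihash-sorted codec choice and the 1 MiB cap are illustrative:

package carexample

import (
	"io"

	carv2 "github.com/ipld/go-car/v2"
	"github.com/ipld/go-car/v2/index"
	"github.com/multiformats/go-multicodec"
)

// indexPayload builds a multihash-sorted index over a CAR stream while
// rejecting oversized headers up front.
func indexPayload(r io.Reader) (index.Index, error) {
	idx, err := index.New(multicodec.CarMultihashIndexSorted)
	if err != nil {
		return nil, err
	}
	if err := carv2.LoadIndex(idx, r, carv2.MaxAllowedHeaderSize(1<<20)); err != nil {
		return nil, err
	}
	return idx, nil
}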
- v1h, err := carv1.ReadHeader(reader) + v1h, err := carv1.ReadHeader(reader, o.MaxAllowedHeaderSize) if err != nil { return err } @@ -104,9 +107,6 @@ func LoadIndex(idx index.Index, r io.Reader, opts ...Option) error { // CARv2 header. sectionOffset -= dataOffset - // Parse Options. - o := ApplyOptions(opts...) - records := make([]index.Record, 0) for { // Read the section's length. @@ -188,7 +188,7 @@ func GenerateIndexFromFile(path string, opts ...Option) (index.Index, error) { // given reader to fulfill index lookup. func ReadOrGenerateIndex(rs io.ReadSeeker, opts ...Option) (index.Index, error) { // Read version. - version, err := ReadVersion(rs) + version, err := ReadVersion(rs, opts...) if err != nil { return nil, err } @@ -209,10 +209,18 @@ func ReadOrGenerateIndex(rs io.ReadSeeker, opts ...Option) (index.Index, error) } // If index is present, then no need to generate; decode and return it. if v2r.Header.HasIndex() { - return index.ReadFrom(v2r.IndexReader()) + ir, err := v2r.IndexReader() + if err != nil { + return nil, err + } + return index.ReadFrom(ir) } // Otherwise, generate index from CARv1 payload wrapped within CARv2 format. - return GenerateIndex(v2r.DataReader(), opts...) + dr, err := v2r.DataReader() + if err != nil { + return nil, err + } + return GenerateIndex(dr, opts...) default: return nil, fmt.Errorf("unknown version %v", version) } diff --git a/v2/index_gen_test.go b/v2/index_gen_test.go index 11f0530f..11011e81 100644 --- a/v2/index_gen_test.go +++ b/v2/index_gen_test.go @@ -1,6 +1,7 @@ package car_test import ( + "github.com/stretchr/testify/assert" "io" "os" "testing" @@ -13,25 +14,25 @@ import ( "github.com/multiformats/go-multicodec" "github.com/multiformats/go-multihash" "github.com/multiformats/go-varint" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) func TestGenerateIndex(t *testing.T) { - tests := []struct { + type testCase struct { name string carPath string opts []carv2.Option wantIndexer func(t *testing.T) index.Index wantErr bool - }{ + } + tests := []testCase{ { name: "CarV1IsIndexedAsExpected", carPath: "testdata/sample-v1.car", wantIndexer: func(t *testing.T) index.Index { v1, err := os.Open("testdata/sample-v1.car") require.NoError(t, err) - defer v1.Close() + t.Cleanup(func() { assert.NoError(t, v1.Close()) }) want, err := carv2.GenerateIndex(v1) require.NoError(t, err) return want @@ -43,10 +44,12 @@ func TestGenerateIndex(t *testing.T) { wantIndexer: func(t *testing.T) index.Index { v2, err := os.Open("testdata/sample-wrapped-v2.car") require.NoError(t, err) - defer v2.Close() + t.Cleanup(func() { assert.NoError(t, v2.Close()) }) reader, err := carv2.NewReader(v2) require.NoError(t, err) - want, err := index.ReadFrom(reader.IndexReader()) + ir, err := reader.IndexReader() + require.NoError(t, err) + want, err := index.ReadFrom(ir) require.NoError(t, err) return want }, @@ -58,7 +61,7 @@ func TestGenerateIndex(t *testing.T) { wantIndexer: func(t *testing.T) index.Index { v1, err := os.Open("testdata/sample-v1-with-zero-len-section.car") require.NoError(t, err) - defer v1.Close() + t.Cleanup(func() { assert.NoError(t, v1.Close()) }) want, err := carv2.GenerateIndex(v1, carv2.ZeroLengthSectionAsEOF(true)) require.NoError(t, err) return want @@ -71,7 +74,7 @@ func TestGenerateIndex(t *testing.T) { wantIndexer: func(t *testing.T) index.Index { v1, err := os.Open("testdata/sample-v1-with-zero-len-section2.car") require.NoError(t, err) - defer v1.Close() + t.Cleanup(func() { assert.NoError(t, v1.Close()) }) want, err 
:= carv2.GenerateIndex(v1, carv2.ZeroLengthSectionAsEOF(true)) require.NoError(t, err) return want @@ -89,67 +92,49 @@ func TestGenerateIndex(t *testing.T) { wantErr: true, }, } + + requireWant := func(tt testCase, got index.Index, gotErr error) { + if tt.wantErr { + require.Error(t, gotErr) + } else { + require.NoError(t, gotErr) + var want index.Index + if tt.wantIndexer != nil { + want = tt.wantIndexer(t) + } + if want == nil { + require.Nil(t, got) + } else { + require.Equal(t, want, got) + } + } + } + for _, tt := range tests { t.Run("ReadOrGenerateIndex_"+tt.name, func(t *testing.T) { carFile, err := os.Open(tt.carPath) require.NoError(t, err) t.Cleanup(func() { assert.NoError(t, carFile.Close()) }) - got, err := carv2.ReadOrGenerateIndex(carFile, tt.opts...) - if tt.wantErr { - require.Error(t, err) - } else { - require.NoError(t, err) - var want index.Index - if tt.wantIndexer != nil { - want = tt.wantIndexer(t) - } - require.Equal(t, want, got) - } + got, gotErr := carv2.ReadOrGenerateIndex(carFile, tt.opts...) + requireWant(tt, got, gotErr) }) t.Run("GenerateIndexFromFile_"+tt.name, func(t *testing.T) { - got, err := carv2.GenerateIndexFromFile(tt.carPath, tt.opts...) - if tt.wantErr { - require.Error(t, err) - } else { - require.NoError(t, err) - var want index.Index - if tt.wantIndexer != nil { - want = tt.wantIndexer(t) - } - require.Equal(t, want, got) - } + got, gotErr := carv2.GenerateIndexFromFile(tt.carPath, tt.opts...) + requireWant(tt, got, gotErr) }) t.Run("LoadIndex_"+tt.name, func(t *testing.T) { carFile, err := os.Open(tt.carPath) require.NoError(t, err) got, err := index.New(multicodec.CarMultihashIndexSorted) require.NoError(t, err) - err = carv2.LoadIndex(got, carFile, tt.opts...) - if tt.wantErr { - require.Error(t, err) - } else { - require.NoError(t, err) - var want index.Index - if tt.wantIndexer != nil { - want = tt.wantIndexer(t) - } - require.Equal(t, want, got) - } + gotErr := carv2.LoadIndex(got, carFile, tt.opts...) + requireWant(tt, got, gotErr) }) t.Run("GenerateIndex_"+tt.name, func(t *testing.T) { carFile, err := os.Open(tt.carPath) require.NoError(t, err) - got, err := carv2.GenerateIndex(carFile, tt.opts...) - if tt.wantErr { - require.Error(t, err) - } else { - require.NoError(t, err) - var want index.Index - if tt.wantIndexer != nil { - want = tt.wantIndexer(t) - } - require.Equal(t, want, got) - } + got, gotErr := carv2.GenerateIndex(carFile, tt.opts...) 
+ requireWant(tt, got, gotErr) }) } } @@ -251,7 +236,7 @@ func generateMultihashSortedIndex(t *testing.T, path string) *index.MultihashInd require.NoError(t, err) t.Cleanup(func() { require.NoError(t, f.Close()) }) reader := internalio.ToByteReadSeeker(f) - header, err := carv1.ReadHeader(reader) + header, err := carv1.ReadHeader(reader, carv1.DefaultMaxAllowedHeaderSize) require.NoError(t, err) require.Equal(t, uint64(1), header.Version) diff --git a/v2/internal/carv1/car.go b/v2/internal/carv1/car.go index 48b7c86b..f62899b7 100644 --- a/v2/internal/carv1/car.go +++ b/v2/internal/carv1/car.go @@ -14,6 +14,9 @@ import ( "github.com/ipfs/go-merkledag" ) +const DefaultMaxAllowedHeaderSize uint64 = 32 << 20 // 32MiB +const DefaultMaxAllowedSectionSize uint64 = 8 << 20 // 8MiB + func init() { cbor.RegisterCborType(CarHeader{}) } @@ -56,9 +59,12 @@ func WriteCar(ctx context.Context, ds format.NodeGetter, roots []cid.Cid, w io.W return nil } -func ReadHeader(r io.Reader) (*CarHeader, error) { - hb, err := util.LdRead(r, false) +func ReadHeader(r io.Reader, maxReadBytes uint64) (*CarHeader, error) { + hb, err := util.LdRead(r, false, maxReadBytes) if err != nil { + if err == util.ErrSectionTooLarge { + err = util.ErrHeaderTooLarge + } return nil, err } @@ -106,21 +112,22 @@ func (cw *carWriter) writeNode(ctx context.Context, nd format.Node) error { } type CarReader struct { - r io.Reader - Header *CarHeader - zeroLenAsEOF bool + r io.Reader + Header *CarHeader + zeroLenAsEOF bool + maxAllowedSectionSize uint64 } func NewCarReaderWithZeroLengthSectionAsEOF(r io.Reader) (*CarReader, error) { - return newCarReader(r, true) + return NewCarReaderWithoutDefaults(r, true, DefaultMaxAllowedHeaderSize, DefaultMaxAllowedSectionSize) } func NewCarReader(r io.Reader) (*CarReader, error) { - return newCarReader(r, false) + return NewCarReaderWithoutDefaults(r, false, DefaultMaxAllowedHeaderSize, DefaultMaxAllowedSectionSize) } -func newCarReader(r io.Reader, zeroLenAsEOF bool) (*CarReader, error) { - ch, err := ReadHeader(r) +func NewCarReaderWithoutDefaults(r io.Reader, zeroLenAsEOF bool, maxAllowedHeaderSize uint64, maxAllowedSectionSize uint64) (*CarReader, error) { + ch, err := ReadHeader(r, maxAllowedHeaderSize) if err != nil { return nil, err } @@ -134,14 +141,15 @@ func newCarReader(r io.Reader, zeroLenAsEOF bool) (*CarReader, error) { } return &CarReader{ - r: r, - Header: ch, - zeroLenAsEOF: zeroLenAsEOF, + r: r, + Header: ch, + zeroLenAsEOF: zeroLenAsEOF, + maxAllowedSectionSize: maxAllowedSectionSize, }, nil } func (cr *CarReader) Next() (blocks.Block, error) { - c, data, err := util.ReadNode(cr.r, cr.zeroLenAsEOF) + c, data, err := util.ReadNode(cr.r, cr.zeroLenAsEOF, cr.maxAllowedSectionSize) if err != nil { return nil, err } diff --git a/v2/internal/carv1/util/util.go b/v2/internal/carv1/util/util.go index 6b949561..7963812e 100644 --- a/v2/internal/carv1/util/util.go +++ b/v2/internal/carv1/util/util.go @@ -1,6 +1,7 @@ package util import ( + "errors" "io" internalio "github.com/ipld/go-car/v2/internal/io" @@ -10,13 +11,16 @@ import ( cid "github.com/ipfs/go-cid" ) +var ErrSectionTooLarge = errors.New("invalid section data, length of read beyond allowable maximum") +var ErrHeaderTooLarge = errors.New("invalid header data, length of read beyond allowable maximum") + type BytesReader interface { io.Reader io.ByteReader } -func ReadNode(r io.Reader, zeroLenAsEOF bool) (cid.Cid, []byte, error) { - data, err := LdRead(r, zeroLenAsEOF) +func ReadNode(r io.Reader, zeroLenAsEOF bool, maxReadBytes 
uint64) (cid.Cid, []byte, error) { + data, err := LdRead(r, zeroLenAsEOF, maxReadBytes) if err != nil { return cid.Cid{}, nil, err } @@ -61,7 +65,7 @@ func LdSize(d ...[]byte) uint64 { return sum + uint64(s) } -func LdRead(r io.Reader, zeroLenAsEOF bool) ([]byte, error) { +func LdRead(r io.Reader, zeroLenAsEOF bool, maxReadBytes uint64) ([]byte, error) { l, err := varint.ReadUvarint(internalio.ToByteReader(r)) if err != nil { // If the length of bytes read is non-zero when the error is EOF then signal an unclean EOF. @@ -73,6 +77,10 @@ func LdRead(r io.Reader, zeroLenAsEOF bool) ([]byte, error) { return nil, io.EOF } + if l > maxReadBytes { // Don't OOM + return nil, ErrSectionTooLarge + } + buf := make([]byte, l) if _, err := io.ReadFull(r, buf); err != nil { return nil, err diff --git a/v2/internal/errsort/search.go b/v2/internal/errsort/search.go new file mode 100644 index 00000000..fb886179 --- /dev/null +++ b/v2/internal/errsort/search.go @@ -0,0 +1,54 @@ +/* +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +package errsort + +// Search is like sort.Search but accepts an erroring closure. +// If it errors the search is terminated immediately +func Search(n int, f func(int) (bool, error)) (int, error) { + // Define f(-1) == false and f(n) == true. + // Invariant: f(i-1) == false, f(j) == true. + i, j := 0, n + for i < j { + h := int(uint(i+j) >> 1) // avoid overflow when computing h + // i ≤ h < j + less, err := f(h) + if err != nil { + return 0, err + } + if !less { + i = h + 1 // preserves f(i-1) == false + } else { + j = h // preserves f(j) == true + } + } + // i == j, f(i-1) == false, and f(j) (= f(i)) == true => answer is i. 
+ return i, nil +} diff --git a/v2/internal/io/offset_read_seeker.go b/v2/internal/io/offset_read_seeker.go index 4b701351..b3899ab7 100644 --- a/v2/internal/io/offset_read_seeker.go +++ b/v2/internal/io/offset_read_seeker.go @@ -1,63 +1,115 @@ package io -import "io" +import ( + "errors" + "io" +) var ( - _ io.ReaderAt = (*OffsetReadSeeker)(nil) - _ io.ReadSeeker = (*OffsetReadSeeker)(nil) + _ io.ReaderAt = (*offsetReadSeeker)(nil) + _ io.ReadSeeker = (*offsetReadSeeker)(nil) ) -// OffsetReadSeeker implements Read, and ReadAt on a section +// offsetReadSeeker implements Read, and ReadAt on a section // of an underlying io.ReaderAt. -// The main difference between io.SectionReader and OffsetReadSeeker is that +// The main difference between io.SectionReader and offsetReadSeeker is that // NewOffsetReadSeeker does not require the user to know the number of readable bytes. // // It also partially implements Seek, where the implementation panics if io.SeekEnd is passed. -// This is because, OffsetReadSeeker does not know the end of the file therefore cannot seek relative +// This is because, offsetReadSeeker does not know the end of the file therefore cannot seek relative // to it. -type OffsetReadSeeker struct { +type offsetReadSeeker struct { r io.ReaderAt base int64 off int64 + b [1]byte // avoid alloc in ReadByte +} + +type ReadSeekerAt interface { + io.Reader + io.ReaderAt + io.Seeker + io.ByteReader } -// NewOffsetReadSeeker returns an OffsetReadSeeker that reads from r +// NewOffsetReadSeeker returns an ReadSeekerAt that reads from r // starting offset offset off and stops with io.EOF when r reaches its end. // The Seek function will panic if whence io.SeekEnd is passed. -func NewOffsetReadSeeker(r io.ReaderAt, off int64) *OffsetReadSeeker { - return &OffsetReadSeeker{r, off, off} +func NewOffsetReadSeeker(r io.ReaderAt, off int64) (ReadSeekerAt, error) { + if or, ok := r.(*offsetReadSeeker); ok { + oldBase := or.base + newBase := or.base + off + if newBase < oldBase { + return nil, errors.New("NewOffsetReadSeeker overflow int64") + } + return &offsetReadSeeker{ + r: or.r, + base: newBase, + off: newBase, + }, nil + } + return &offsetReadSeeker{ + r: r, + base: off, + off: off, + }, nil } -func (o *OffsetReadSeeker) Read(p []byte) (n int, err error) { +func (o *offsetReadSeeker) Read(p []byte) (n int, err error) { n, err = o.r.ReadAt(p, o.off) - o.off += int64(n) + oldOffset := o.off + off := oldOffset + int64(n) + if off < oldOffset { + return 0, errors.New("ReadAt offset overflow") + } + o.off = off return } -func (o *OffsetReadSeeker) ReadAt(p []byte, off int64) (n int, err error) { +func (o *offsetReadSeeker) ReadAt(p []byte, off int64) (n int, err error) { if off < 0 { return 0, io.EOF } + oldOffset := off off += o.base + if off < oldOffset { + return 0, errors.New("ReadAt offset overflow") + } return o.r.ReadAt(p, off) } -func (o *OffsetReadSeeker) ReadByte() (byte, error) { - b := []byte{0} - _, err := o.Read(b) - return b[0], err +func (o *offsetReadSeeker) ReadByte() (byte, error) { + _, err := o.Read(o.b[:]) + return o.b[0], err } -func (o *OffsetReadSeeker) Offset() int64 { +func (o *offsetReadSeeker) Offset() int64 { return o.off } -func (o *OffsetReadSeeker) Seek(offset int64, whence int) (int64, error) { +func (o *offsetReadSeeker) Seek(offset int64, whence int) (int64, error) { switch whence { case io.SeekStart: - o.off = offset + o.base + oldOffset := offset + off := offset + o.base + if off < oldOffset { + return 0, errors.New("Seek offset overflow") + } + o.off = off 
case io.SeekCurrent: - o.off += offset + oldOffset := o.off + if offset < 0 { + if -offset > oldOffset { + return 0, errors.New("Seek offset underflow") + } + o.off = oldOffset + offset + } else { + off := oldOffset + offset + if off < oldOffset { + return 0, errors.New("Seek offset overflow") + } + o.off = off + } case io.SeekEnd: panic("unsupported whence: SeekEnd") } @@ -65,6 +117,6 @@ func (o *OffsetReadSeeker) Seek(offset int64, whence int) (int64, error) { } // Position returns the current position of this reader relative to the initial offset. -func (o *OffsetReadSeeker) Position() int64 { +func (o *offsetReadSeeker) Position() int64 { return o.off - o.base } diff --git a/v2/options.go b/v2/options.go index d2923b21..d2e526c4 100644 --- a/v2/options.go +++ b/v2/options.go @@ -6,11 +6,29 @@ import ( "github.com/ipld/go-car/v2/index" "github.com/ipld/go-ipld-prime/traversal" "github.com/multiformats/go-multicodec" + + "github.com/ipld/go-car/v2/internal/carv1" ) // DefaultMaxIndexCidSize specifies the maximum size in byptes accepted as a section CID by CARv2 index. const DefaultMaxIndexCidSize = 2 << 10 // 2 KiB +// DefaultMaxAllowedHeaderSize specifies the default maximum size that a CARv1 +// decode (including within a CARv2 container) will allow a header to be without +// erroring. This is to prevent OOM errors where a header prefix includes a +// too-large size specifier. +// Currently set to 32 MiB. +const DefaultMaxAllowedHeaderSize = carv1.DefaultMaxAllowedHeaderSize + +// DefaultMaxAllowedSectionSize specifies the default maximum size that a CARv1 +// decode (including within a CARv2 container) will allow a section to be +// without erroring. This is to prevent OOM errors where a section prefix +// includes a too-large size specifier. +// Typically IPLD blocks should be under 2 MiB (ideally under 1 MiB), so unless +// atypical data is expected, this should not be a large value. +// Currently set to 8 MiB. +const DefaultMaxAllowedSectionSize = carv1.DefaultMaxAllowedSectionSize + // Option describes an option which affects behavior when interacting with CAR files. type Option func(*Options) @@ -42,14 +60,20 @@ type Options struct { MaxTraversalLinks uint64 WriteAsCarV1 bool TraversalPrototypeChooser traversal.LinkTargetNodePrototypeChooser + + MaxAllowedHeaderSize uint64 + MaxAllowedSectionSize uint64 } // ApplyOptions applies given opts and returns the resulting Options. // This function should not be used directly by end users; it's only exposed as a // side effect of Option. func ApplyOptions(opt ...Option) Options { - var opts Options - opts.MaxTraversalLinks = math.MaxInt64 //default: traverse all + opts := Options{ + MaxTraversalLinks: math.MaxInt64, //default: traverse all + MaxAllowedHeaderSize: carv1.DefaultMaxAllowedHeaderSize, + MaxAllowedSectionSize: carv1.DefaultMaxAllowedSectionSize, + } for _, o := range opt { o(&opts) } @@ -128,3 +152,23 @@ func WithTraversalPrototypeChooser(t traversal.LinkTargetNodePrototypeChooser) O o.TraversalPrototypeChooser = t } } + +// MaxAllowedHeaderSize overrides the default maximum size (of 32 MiB) that a +// CARv1 decode (including within a CARv2 container) will allow a header to be +// without erroring. +func MaxAllowedHeaderSize(max uint64) Option { + return func(o *Options) { + o.MaxAllowedHeaderSize = max + } +} + +// MaxAllowedSectionSize overrides the default maximum size (of 8 MiB) that a +// CARv1 decode (including within a CARv2 container) will allow a section to be +// without erroring.
+// Typically IPLD blocks should be under 2 MiB (ideally under 1 MiB), so unless +// atypical data is expected, this should not be a large value. +func MaxAllowedSectionSize(max uint64) Option { + return func(o *Options) { + o.MaxAllowedSectionSize = max + } +} diff --git a/v2/options_test.go b/v2/options_test.go index 7e060acf..6daa473f 100644 --- a/v2/options_test.go +++ b/v2/options_test.go @@ -12,9 +12,11 @@ import ( func TestApplyOptions_SetsExpectedDefaults(t *testing.T) { require.Equal(t, carv2.Options{ - IndexCodec: multicodec.CarMultihashIndexSorted, - MaxIndexCidSize: carv2.DefaultMaxIndexCidSize, - MaxTraversalLinks: math.MaxInt64, + IndexCodec: multicodec.CarMultihashIndexSorted, + MaxIndexCidSize: carv2.DefaultMaxIndexCidSize, + MaxTraversalLinks: math.MaxInt64, + MaxAllowedHeaderSize: 32 << 20, + MaxAllowedSectionSize: 8 << 20, }, carv2.ApplyOptions()) } @@ -30,6 +32,8 @@ func TestApplyOptions_AppliesOptions(t *testing.T) { BlockstoreAllowDuplicatePuts: true, BlockstoreUseWholeCIDs: true, MaxTraversalLinks: math.MaxInt64, + MaxAllowedHeaderSize: 101, + MaxAllowedSectionSize: 202, }, carv2.ApplyOptions( carv2.UseDataPadding(123), @@ -38,6 +42,8 @@ func TestApplyOptions_AppliesOptions(t *testing.T) { carv2.ZeroLengthSectionAsEOF(true), carv2.MaxIndexCidSize(789), carv2.StoreIdentityCIDs(true), + carv2.MaxAllowedHeaderSize(101), + carv2.MaxAllowedSectionSize(202), blockstore.AllowDuplicatePuts(true), blockstore.UseWholeCIDs(true), )) diff --git a/v2/reader.go b/v2/reader.go index 9394a736..4628fd89 100644 --- a/v2/reader.go +++ b/v2/reader.go @@ -1,12 +1,19 @@ package car import ( + "errors" "fmt" "io" + "math" "github.com/ipfs/go-cid" + "github.com/ipld/go-car/v2/index" "github.com/ipld/go-car/v2/internal/carv1" + "github.com/ipld/go-car/v2/internal/carv1/util" internalio "github.com/ipld/go-car/v2/internal/io" + "github.com/multiformats/go-multicodec" + "github.com/multiformats/go-multihash" + "github.com/multiformats/go-varint" "golang.org/x/exp/mmap" ) @@ -49,9 +56,11 @@ func NewReader(r io.ReaderAt, opts ...Option) (*Reader, error) { } cr.opts = ApplyOptions(opts...) - or := internalio.NewOffsetReadSeeker(r, 0) - var err error - cr.Version, err = ReadVersion(or) + or, err := internalio.NewOffsetReadSeeker(r, 0) + if err != nil { + return nil, err + } + cr.Version, err = ReadVersion(or, opts...) if err != nil { return nil, err } @@ -75,7 +84,11 @@ func (r *Reader) Roots() ([]cid.Cid, error) { if r.roots != nil { return r.roots, nil } - header, err := carv1.ReadHeader(r.DataReader()) + dr, err := r.DataReader() + if err != nil { + return nil, err + } + header, err := carv1.ReadHeader(dr, r.opts.MaxAllowedHeaderSize) if err != nil { return nil, err } @@ -99,9 +112,9 @@ type SectionReader interface { } // DataReader provides a reader containing the data payload in CARv1 format. -func (r *Reader) DataReader() SectionReader { +func (r *Reader) DataReader() (SectionReader, error) { if r.Version == 2 { - return io.NewSectionReader(r.r, int64(r.Header.DataOffset), int64(r.Header.DataSize)) + return io.NewSectionReader(r.r, int64(r.Header.DataOffset), int64(r.Header.DataSize)), nil } return internalio.NewOffsetReadSeeker(r.r, 0) } @@ -109,13 +122,233 @@ func (r *Reader) DataReader() SectionReader { // IndexReader provides an io.Reader containing the index for the data payload if the index is // present. Otherwise, returns nil. // Note, this function will always return nil if the backing payload represents a CARv1. 
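Editor's note, illustrative only and not part of the patch: the sketch below shows one way a consumer might combine the new size-limit options with the now error-returning DataReader. The function name, file path, and chosen limits are hypothetical.

	import (
		"io"

		carv2 "github.com/ipld/go-car/v2"
	)

	// dumpPayload copies the inner CARv1 payload to io.Discard, tightening the
	// decode limits below their defaults (32 MiB header, 8 MiB section).
	func dumpPayload(path string) error {
		rd, err := carv2.OpenReader(path,
			carv2.MaxAllowedHeaderSize(1<<20),  // cap the CARv1 header at 1 MiB
			carv2.MaxAllowedSectionSize(2<<20), // cap each block section at 2 MiB
		)
		if err != nil {
			return err
		}
		defer rd.Close()

		// DataReader now returns an error alongside the payload reader.
		dr, err := rd.DataReader()
		if err != nil {
			return err
		}
		_, err = io.Copy(io.Discard, dr)
		return err
	}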
-func (r *Reader) IndexReader() io.Reader { +func (r *Reader) IndexReader() (io.Reader, error) { if r.Version == 1 || !r.Header.HasIndex() { - return nil + return nil, nil } return internalio.NewOffsetReadSeeker(r.r, int64(r.Header.IndexOffset)) } +// Stats is returned by an Inspect() call +type Stats struct { + Version uint64 + Header Header + Roots []cid.Cid + RootsPresent bool + BlockCount uint64 + CodecCounts map[multicodec.Code]uint64 + MhTypeCounts map[multicodec.Code]uint64 + AvgCidLength uint64 + MaxCidLength uint64 + MinCidLength uint64 + AvgBlockLength uint64 + MaxBlockLength uint64 + MinBlockLength uint64 + IndexCodec multicodec.Code +} + +// Inspect does a quick scan of a CAR, performing basic validation of the format +// and returning a Stats object that provides a high-level description of the +// contents of the CAR. +// Inspect works for CARv1 and CARv2 contents. A CARv1 will return an +// uninitialized Header value. +// +// If validateBlockHash is true, all block data in the payload will be hashed +// and compared to the CID for that block and an error will be returned if there +// is a mismatch. If false, block data will be skipped over and not checked. +// Performing a full block hash validation is similar to using a BlockReader and +// calling Next over all blocks. +// +// Inspect will perform a basic check of a CARv2 index, where present, but this +// does not guarantee that the index is correct. Attempting to read index data +// from untrusted sources is not recommended. If required, further validation of +// an index can be performed by loading the index and performing a ForEach() and +// sanity checking that the offsets are within the data payload section of the +// CAR. However, re-generation of index data in this case is the recommended +// course of action. +// +// Beyond the checks performed by Inspect, a valid / good CAR is somewhat +// use-case dependent. Factors to consider include: +// +// * Bad indexes, including incorrect offsets, duplicate entries, or other +// faulty data. Indexes should be re-generated, regardless, if you need to use +// them and have any reason to not trust the source. +// +// * Blocks use codecs that your system doesn't have access to—which may mean +// you can't traverse a DAG or use the contained data. Stats.CodecCounts +// contains a list of codecs found in the CAR so this can be checked. +// +// * CIDs use multihashes that your system doesn't have access to—which will +// mean you can't validate block hashes are correct (using validateBlockHash +// in this case will result in a failure). Stats.MhTypeCounts contains a +// list of multihashes found in the CAR so this can be checked. +// +// * The presence of IDENTITY CIDs, which may not be supported (or desired) by +// the consumer of the CAR. Stats.CodecCounts can determine the presence +// of IDENTITY CIDs. +// +// * Roots: the number of roots, duplicates, and whether they are related to the +// blocks contained within the CAR. Stats contains a list of Roots and a +// RootsPresent bool so further checks can be performed. +// +// * DAG completeness is not checked. Any properties relating to the DAG, or +// DAGs contained within a CAR are the responsibility of the user to check.
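Editor's note, illustrative only and not part of the patch: a minimal sketch of driving Inspect over a CAR and reading back a few Stats fields. The function name and "example.car" path are hypothetical; error handling is deliberately minimal.

	import (
		"fmt"
		"log"

		carv2 "github.com/ipld/go-car/v2"
	)

	func inspectExample() {
		rd, err := carv2.OpenReader("example.car") // hypothetical path
		if err != nil {
			log.Fatal(err)
		}
		defer rd.Close()

		// Pass true to hash every block and compare it against its CID while scanning.
		stats, err := rd.Inspect(true)
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("version=%d blocks=%d roots present=%t\n",
			stats.Version, stats.BlockCount, stats.RootsPresent)
		for codec, count := range stats.CodecCounts {
			fmt.Printf("codec %s: %d blocks\n", codec, count)
		}
	}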
+func (r *Reader) Inspect(validateBlockHash bool) (Stats, error) { + stats := Stats{ + Version: r.Version, + Header: r.Header, + CodecCounts: make(map[multicodec.Code]uint64), + MhTypeCounts: make(map[multicodec.Code]uint64), + } + + var totalCidLength uint64 + var totalBlockLength uint64 + var minCidLength uint64 = math.MaxUint64 + var minBlockLength uint64 = math.MaxUint64 + + dr, err := r.DataReader() + if err != nil { + return Stats{}, err + } + bdr := internalio.ToByteReader(dr) + + // read roots, not using Roots(), because we need the offset set up in the data reader + header, err := carv1.ReadHeader(dr, r.opts.MaxAllowedHeaderSize) + if err != nil { + return Stats{}, err + } + stats.Roots = header.Roots + var rootsPresentCount int + rootsPresent := make([]bool, len(stats.Roots)) + + // read block sections + for { + sectionLength, err := varint.ReadUvarint(bdr) + if err != nil { + if err == io.EOF { + // if the length of bytes read is non-zero when the error is EOF then signal an unclean EOF. + if sectionLength > 0 { + return Stats{}, io.ErrUnexpectedEOF + } + // otherwise, this is a normal ending + break + } + return Stats{}, err + } + if sectionLength == 0 && r.opts.ZeroLengthSectionAsEOF { + // normal ending for this read mode + break + } + if sectionLength > r.opts.MaxAllowedSectionSize { + return Stats{}, util.ErrSectionTooLarge + } + + // decode just the CID bytes + cidLen, c, err := cid.CidFromReader(dr) + if err != nil { + return Stats{}, err + } + + if sectionLength < uint64(cidLen) { + // this case is handled differently in the normal ReadNode() path since it + // slurps in the whole section bytes and decodes CID from there - so an + // error should come from a failing io.ReadFull + return Stats{}, errors.New("section length shorter than CID length") + } + + // is this a root block? (also account for duplicate root CIDs) + if rootsPresentCount < len(stats.Roots) { + for i, r := range stats.Roots { + if !rootsPresent[i] && c == r { + rootsPresent[i] = true + rootsPresentCount++ + } + } + } + + cp := c.Prefix() + codec := multicodec.Code(cp.Codec) + count := stats.CodecCounts[codec] + stats.CodecCounts[codec] = count + 1 + mhtype := multicodec.Code(cp.MhType) + count = stats.MhTypeCounts[mhtype] + stats.MhTypeCounts[mhtype] = count + 1 + + blockLength := sectionLength - uint64(cidLen) + + if validateBlockHash { + // Use multihash.SumStream to avoid having to copy the entire block content into memory. + // SumStream uses a buffered copy to write bytes into the hasher which will take + // advantage of streaming hash calculation depending on the hash function. + // TODO: introduce SumStream in go-cid to simplify the code here.
+ blockReader := io.LimitReader(dr, int64(blockLength)) + mhl := cp.MhLength + if mhtype == multicodec.Identity { + mhl = -1 + } + mh, err := multihash.SumStream(blockReader, cp.MhType, mhl) + if err != nil { + return Stats{}, err + } + var gotCid cid.Cid + switch cp.Version { + case 0: + gotCid = cid.NewCidV0(mh) + case 1: + gotCid = cid.NewCidV1(cp.Codec, mh) + default: + return Stats{}, fmt.Errorf("invalid cid version: %d", cp.Version) + } + if !gotCid.Equals(c) { + return Stats{}, fmt.Errorf("mismatch in content integrity, expected: %s, got: %s", c, gotCid) + } + } else { + // otherwise, skip over it + if _, err := dr.Seek(int64(blockLength), io.SeekCurrent); err != nil { + return Stats{}, err + } + } + + stats.BlockCount++ + totalCidLength += uint64(cidLen) + totalBlockLength += blockLength + if uint64(cidLen) < minCidLength { + minCidLength = uint64(cidLen) + } + if uint64(cidLen) > stats.MaxCidLength { + stats.MaxCidLength = uint64(cidLen) + } + if blockLength < minBlockLength { + minBlockLength = blockLength + } + if blockLength > stats.MaxBlockLength { + stats.MaxBlockLength = blockLength + } + } + + stats.RootsPresent = len(stats.Roots) == rootsPresentCount + if stats.BlockCount > 0 { + stats.MinCidLength = minCidLength + stats.MinBlockLength = minBlockLength + stats.AvgCidLength = totalCidLength / stats.BlockCount + stats.AvgBlockLength = totalBlockLength / stats.BlockCount + } + + if stats.Version != 1 && stats.Header.HasIndex() { + idxr, err := r.IndexReader() + if err != nil { + return Stats{}, err + } + idx, err := index.ReadFrom(idxr) + if err != nil { + return Stats{}, err + } + stats.IndexCodec = idx.Codec() + } + + return stats, nil +} + // Close closes the underlying reader if it was opened by OpenReader. func (r *Reader) Close() error { if r.closer != nil { @@ -126,8 +359,9 @@ func (r *Reader) Close() error { // ReadVersion reads the version from the pragma. // This function accepts both CARv1 and CARv2 payloads. -func ReadVersion(r io.Reader) (uint64, error) { - header, err := carv1.ReadHeader(r) +func ReadVersion(r io.Reader, opts ...Option) (uint64, error) { + o := ApplyOptions(opts...) 
+ header, err := carv1.ReadHeader(r, o.MaxAllowedHeaderSize) if err != nil { return 0, err } diff --git a/v2/reader_test.go b/v2/reader_test.go index a0c6e3cd..cdeb74d7 100644 --- a/v2/reader_test.go +++ b/v2/reader_test.go @@ -1,13 +1,18 @@ package car_test import ( + "bytes" + "encoding/hex" "io" "os" + "strings" "testing" + "github.com/ipfs/go-cid" carv2 "github.com/ipld/go-car/v2" "github.com/ipld/go-car/v2/index" "github.com/ipld/go-car/v2/internal/carv1" + "github.com/multiformats/go-multicodec" "github.com/stretchr/testify/require" ) @@ -135,7 +140,9 @@ func TestReader_WithCarV1Consistency(t *testing.T) { gotRoots, err := subject.Roots() require.NoError(t, err) require.Equal(t, wantReader.Header.Roots, gotRoots) - require.Nil(t, subject.IndexReader()) + ir, err := subject.IndexReader() + require.Nil(t, ir) + require.NoError(t, err) }) } } @@ -167,11 +174,14 @@ func TestReader_WithCarV2Consistency(t *testing.T) { require.NoError(t, err) require.Equal(t, wantReader.Header.Roots, gotRoots) - gotIndexReader := subject.IndexReader() + gotIndexReader, err := subject.IndexReader() + require.NoError(t, err) require.NotNil(t, gotIndexReader) gotIndex, err := index.ReadFrom(gotIndexReader) require.NoError(t, err) - wantIndex, err := carv2.GenerateIndex(subject.DataReader()) + dr, err := subject.DataReader() + require.NoError(t, err) + wantIndex, err := carv2.GenerateIndex(dr) require.NoError(t, err) require.Equal(t, wantIndex, gotIndex) }) @@ -181,8 +191,10 @@ func TestReader_WithCarV2Consistency(t *testing.T) { func TestOpenReader_DoesNotPanicForReadersCreatedBeforeClosure(t *testing.T) { subject, err := carv2.OpenReader("testdata/sample-wrapped-v2.car") require.NoError(t, err) - dReaderBeforeClosure := subject.DataReader() - iReaderBeforeClosure := subject.IndexReader() + dReaderBeforeClosure, err := subject.DataReader() + require.NoError(t, err) + iReaderBeforeClosure, err := subject.IndexReader() + require.NoError(t, err) require.NoError(t, subject.Close()) buf := make([]byte, 1) @@ -199,8 +211,10 @@ func TestOpenReader_DoesNotPanicForReadersCreatedAfterClosure(t *testing.T) { subject, err := carv2.OpenReader("testdata/sample-wrapped-v2.car") require.NoError(t, err) require.NoError(t, subject.Close()) - dReaderAfterClosure := subject.DataReader() - iReaderAfterClosure := subject.IndexReader() + dReaderAfterClosure, err := subject.DataReader() + require.NoError(t, err) + iReaderAfterClosure, err := subject.IndexReader() + require.NoError(t, err) buf := make([]byte, 1) panicTest := func(r io.Reader) { @@ -231,7 +245,9 @@ func TestReader_ReturnsNilWhenThereIsNoIndex(t *testing.T) { subject, err := carv2.OpenReader(tt.path) require.NoError(t, err) t.Cleanup(func() { require.NoError(t, subject.Close()) }) - require.Nil(t, subject.IndexReader()) + ir, err := subject.IndexReader() + require.NoError(t, err) + require.Nil(t, ir) }) } } @@ -267,3 +283,280 @@ func requireNewCarV1Reader(t *testing.T, r io.Reader, zerLenAsEOF bool) *carv1.C require.NoError(t, err) return cr } + +func TestInspect(t *testing.T) { + tests := []struct { + name string + path string + carHex string + zerLenAsEOF bool + expectedStats carv2.Stats + }{ + { + name: "IndexlessCarV2", + path: "testdata/sample-v2-indexless.car", + expectedStats: carv2.Stats{ + Version: 2, + Header: carv2.Header{ + Characteristics: carv2.Characteristics{0, 0}, + DataOffset: 51, + DataSize: 479907, + IndexOffset: 0, + }, + Roots: []cid.Cid{mustCidDecode("bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy")}, + RootsPresent: true, + 
AvgBlockLength: 417, // 417.6644423260248 + MinBlockLength: 1, + MaxBlockLength: 1342, + AvgCidLength: 37, // 37.86939942802669 + MinCidLength: 14, + MaxCidLength: 38, + BlockCount: 1049, + CodecCounts: map[multicodec.Code]uint64{ + multicodec.Raw: 6, + multicodec.DagCbor: 1043, + }, + MhTypeCounts: map[multicodec.Code]uint64{ + multicodec.Identity: 6, + multicodec.Blake2b256: 1043, + }, + }, + }, + { + // same payload as IndexlessCarV2, so only difference is the Version & Header + name: "CarV1", + path: "testdata/sample-v1.car", + expectedStats: carv2.Stats{ + Version: 1, + Header: carv2.Header{}, + Roots: []cid.Cid{mustCidDecode("bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy")}, + RootsPresent: true, + AvgBlockLength: 417, // 417.6644423260248 + MinBlockLength: 1, + MaxBlockLength: 1342, + AvgCidLength: 37, // 37.86939942802669 + MinCidLength: 14, + MaxCidLength: 38, + BlockCount: 1049, + CodecCounts: map[multicodec.Code]uint64{ + multicodec.Raw: 6, + multicodec.DagCbor: 1043, + }, + MhTypeCounts: map[multicodec.Code]uint64{ + multicodec.Identity: 6, + multicodec.Blake2b256: 1043, + }, + }, + }, + { + // same payload as IndexlessCarV2, so only difference is the Header + name: "CarV2ProducedByBlockstore", + path: "testdata/sample-rw-bs-v2.car", + expectedStats: carv2.Stats{ + Version: 2, + Header: carv2.Header{ + DataOffset: 1464, + DataSize: 273, + IndexOffset: 1737, + }, + Roots: []cid.Cid{ + mustCidDecode("bafkreifuosuzujyf4i6psbneqtwg2fhplc2wxptc5euspa2gn3bwhnihfu"), + mustCidDecode("bafkreifc4hca3inognou377hfhvu2xfchn2ltzi7yu27jkaeujqqqdbjju"), + mustCidDecode("bafkreig5lvr4l6b4fr3un4xvzeyt3scevgsqjgrhlnwxw2unwbn5ro276u"), + }, + RootsPresent: true, + BlockCount: 3, + CodecCounts: map[multicodec.Code]uint64{multicodec.Raw: 3}, + MhTypeCounts: map[multicodec.Code]uint64{multicodec.Sha2_256: 3}, + AvgCidLength: 36, + MaxCidLength: 36, + MinCidLength: 36, + AvgBlockLength: 6, + MaxBlockLength: 9, + MinBlockLength: 4, + IndexCodec: multicodec.CarMultihashIndexSorted, + }, + }, + // same as CarV1 but with a zero-byte EOF to test options + { + name: "CarV1VersionWithZeroLenSectionIsOne", + path: "testdata/sample-v1-with-zero-len-section.car", + zerLenAsEOF: true, + expectedStats: carv2.Stats{ + Version: 1, + Header: carv2.Header{}, + Roots: []cid.Cid{mustCidDecode("bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy")}, + RootsPresent: true, + AvgBlockLength: 417, // 417.6644423260248 + MinBlockLength: 1, + MaxBlockLength: 1342, + AvgCidLength: 37, // 37.86939942802669 + MinCidLength: 14, + MaxCidLength: 38, + BlockCount: 1049, + CodecCounts: map[multicodec.Code]uint64{ + multicodec.Raw: 6, + multicodec.DagCbor: 1043, + }, + MhTypeCounts: map[multicodec.Code]uint64{ + multicodec.Identity: 6, + multicodec.Blake2b256: 1043, + }, + }, + }, + { + // A case where this _could_ be a valid CAR if we allowed identity CIDs + // and not matching block contents to exist, there's no block bytes in + // this. It will only fail if you don't validate the CID matches the, + // bytes (see TestInspectError for that case). 
+ name: "IdentityCID", + // 47 {version:1,roots:[identity cid]} 25 identity cid (dag-json {"identity":"block"}) + carHex: "2f a265726f6f747381d82a581a0001a90200147b226964656e74697479223a22626c6f636b227d6776657273696f6e01 19 01a90200147b226964656e74697479223a22626c6f636b227d", + expectedStats: carv2.Stats{ + Version: 1, + Roots: []cid.Cid{mustCidDecode("baguqeaaupmrgszdfnz2gs5dzei5ceytmn5rwwit5")}, + RootsPresent: true, + BlockCount: 1, + CodecCounts: map[multicodec.Code]uint64{multicodec.DagJson: 1}, + MhTypeCounts: map[multicodec.Code]uint64{multicodec.Identity: 1}, + AvgCidLength: 25, + MaxCidLength: 25, + MinCidLength: 25, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var reader *carv2.Reader + var err error + if tt.path != "" { + reader, err = carv2.OpenReader(tt.path, carv2.ZeroLengthSectionAsEOF(tt.zerLenAsEOF)) + require.NoError(t, err) + } else { + byts, err := hex.DecodeString(strings.ReplaceAll(tt.carHex, " ", "")) + require.NoError(t, err) + reader, err = carv2.NewReader(bytes.NewReader(byts), carv2.ZeroLengthSectionAsEOF(tt.zerLenAsEOF)) + require.NoError(t, err) + } + t.Cleanup(func() { require.NoError(t, reader.Close()) }) + stats, err := reader.Inspect(false) + require.NoError(t, err) + require.Equal(t, tt.expectedStats, stats) + }) + } +} + +func TestInspectError(t *testing.T) { + tests := []struct { + name string + carHex string + expectedOpenError string + expectedInspectError string + validateBlockHash bool + }{ + { + name: "BadCidV0", + carHex: "3aa265726f6f747381d8305825000130302030303030303030303030303030303030303030303030303030303030303030306776657273696f6e010130", + expectedInspectError: "expected 1 as the cid version number, got: 48", + }, + { + name: "BadHeaderLength", + carHex: "e0e0e0e0a7060c6f6c4cca943c236f4b196723489608edb42a8b8fa80b6776657273696f6e19", + expectedOpenError: "invalid header data, length of read beyond allowable maximum", + }, + { + name: "BadSectionLength", + carHex: "11a265726f6f7473806776657273696f6e01e0e0e0e0a7060155122001d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca00000000000000000000", + expectedInspectError: "invalid section data, length of read beyond allowable maximum", + }, + { + name: "BadSectionLength2", + carHex: "3aa265726f6f747381d8305825000130302030303030303030303030303030303030303030303030303030303030303030306776657273696f6e01200130302030303030303030303030303030303030303030303030303030303030303030303030303030303030", + expectedInspectError: "section length shorter than CID length", + validateBlockHash: true, + }, + { + name: "BadSectionLength3", + carHex: "11a265726f6f7473f66776657273696f6e0180", + expectedInspectError: "unexpected EOF", + }, + { + name: "BadBlockHash(SanityCheck)", // this should pass because we don't ask the CID be validated even though it doesn't match + // header cid data + carHex: "11a265726f6f7473806776657273696f6e 012e0155122001d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca ffffffffffffffffffff", + }, + { + name: "BadBlockHash", // same as above, but we ask for CID validation + // header cid data + carHex: "11a265726f6f7473806776657273696f6e 012e0155122001d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca ffffffffffffffffffff", + validateBlockHash: true, + expectedInspectError: "mismatch in content integrity, expected: bafkreiab2rek7wjiazkfrt3hbnqpljmu24226alszdlh6ivic2abgjubzi, got: bafkreiaaqoxrddiyuy6gxnks6ioqytxhq5a7tchm2mm5htigznwiljukmm", + }, + { + name: "IdentityCID", // a case where this _could_ be a valid CAR if 
we allowed identity CIDs and not matching block contents to exist, there's no block bytes in this + // 47 {version:1,roots:[identity cid]} 25 identity cid (dag-json {"identity":"block"}) + carHex: "2f a265726f6f747381d82a581a0001a90200147b226964656e74697479223a22626c6f636b227d6776657273696f6e01 19 01a90200147b226964656e74697479223a22626c6f636b227d", + validateBlockHash: true, + expectedInspectError: "mismatch in content integrity, expected: baguqeaaupmrgszdfnz2gs5dzei5ceytmn5rwwit5, got: baguqeaaa", + }, + // the bad index tests are manually constructed from this single-block CARv2 by adjusting the Uint32 and Uint64 values in the index: + // pragma carv2 header carv1 icodec count codec count (swi) width dataLen mh offset + // 0aa16776657273696f6e02 00000000000000000000000000000000330000000000000041000000000000007400000000000000 11a265726f6f7473806776657273696f6e012e0155122001d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca00000000000000000000 8108 01000000 1200000000000000 01000000 28000000 2800000000000000 01d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca 1200000000000000 + { + name: "BadIndexCountOverflow", + // pragma carv2 header carv1 icodec count codec count (swi) width dataLen mh offset + carHex: "0aa16776657273696f6e02 00000000000000000000000000000000330000000000000041000000000000007400000000000000 11a265726f6f7473806776657273696f6e012e0155122001d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca00000000000000000000 8108 ffffffff 1200000000000000 01000000 28000000 2800000000000000 01d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca 1200000000000000", + expectedInspectError: "index too big; MultihashIndexSorted count is overflowing int32", + }, + { + name: "BadIndexCountTooMany", + // pragma carv2 header carv1 icodec count codec count (swi) width dataLen mh offset + carHex: "0aa16776657273696f6e02 00000000000000000000000000000000330000000000000041000000000000007400000000000000 11a265726f6f7473806776657273696f6e012e0155122001d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca00000000000000000000 8108 ffffff7f 1200000000000000 01000000 28000000 2800000000000000 01d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca 1200000000000000", + expectedInspectError: "unexpected EOF", + }, + { + name: "BadIndexMultiWidthOverflow", + // pragma carv2 header carv1 icodec count codec count (swi) width dataLen mh offset + carHex: "0aa16776657273696f6e02 00000000000000000000000000000000330000000000000041000000000000007400000000000000 11a265726f6f7473806776657273696f6e012e0155122001d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca00000000000000000000 8108 01000000 1200000000000000 ffffffff 28000000 2800000000000000 01d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca 1200000000000000", + expectedInspectError: "index too big; multiWidthIndex count is overflowing int32", + }, + { + name: "BadIndexMultiWidthTooMany", + // pragma carv2 header carv1 icodec count codec count (swi) width dataLen mh offset + carHex: "0aa16776657273696f6e02 00000000000000000000000000000000330000000000000041000000000000007400000000000000 11a265726f6f7473806776657273696f6e012e0155122001d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca00000000000000000000 8108 01000000 1200000000000000 ffffff7f 28000000 2800000000000000 01d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca 1200000000000000", + expectedInspectError: "unexpected EOF", + }, + // we don't test any further into the index, to do that, a user should 
do a ForEach across the loaded index (and sanity check the offsets) + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + car, _ := hex.DecodeString(strings.ReplaceAll(tt.carHex, " ", "")) + reader, err := carv2.NewReader(bytes.NewReader(car)) + if tt.expectedOpenError != "" { + require.Error(t, err) + require.Equal(t, tt.expectedOpenError, err.Error()) + return + } else { + require.NoError(t, err) + } + t.Cleanup(func() { require.NoError(t, reader.Close()) }) + _, err = reader.Inspect(tt.validateBlockHash) + if tt.expectedInspectError != "" { + require.Error(t, err) + require.Equal(t, tt.expectedInspectError, err.Error()) + } else { + require.NoError(t, err) + } + }) + } +} + +func mustCidDecode(s string) cid.Cid { + c, err := cid.Decode(s) + if err != nil { + panic(err) + } + return c +} diff --git a/v2/testdata/fuzz/FuzzBlockReader/c3c7eedeb4968a5b3131371ba9f5138a38c882a7fa06456595998e740a9f5a14 b/v2/testdata/fuzz/FuzzBlockReader/c3c7eedeb4968a5b3131371ba9f5138a38c882a7fa06456595998e740a9f5a14 new file mode 100644 index 00000000..ae9262d4 --- /dev/null +++ b/v2/testdata/fuzz/FuzzBlockReader/c3c7eedeb4968a5b3131371ba9f5138a38c882a7fa06456595998e740a9f5a14 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\xff\x80\xaa\x95\xa6sion\x01,\x01q\x12 \x15\x1f\xe9\xe7