
Merge pull request #267 from ipld/raw-loads
linking: add LoadRaw and LoadPlusRaw functions to LinkSystem.
warpfork committed Oct 22, 2021
2 parents 911ba04 + f7b2b80 commit ebf675a
Showing 1 changed file with 129 additions and 5 deletions.
134 changes: 129 additions & 5 deletions linking/functions.go
@@ -1,6 +1,7 @@
package linking

import (
"bytes"
"context"
"io"

@@ -12,6 +13,7 @@ import (

// Variations:
// - Load vs Store vs ComputeLink
// - Load vs LoadPlusRaw
// - With or without LinkContext?
// - Brevity would be nice but I can't think of what to name the functions, so: everything takes LinkContext. Zero value is fine though.
// - [for load direction only]: Prototype (and return Node|error) or Assembler (and just return error)?
@@ -21,6 +23,31 @@ import (
// Can we get as far as a `QuickLoad(lnk Link) (Node, error)` function, which doesn't even ask you for a NodePrototype?
// No, not quite. (Alas.) If we tried to do so, and make it use `basicnode.Prototype`, we'd have import cycles; ded.

// Load looks up some data identified by a Link, and does everything necessary to turn it into usable data.
// In detail, that means it:
// brings that data into memory,
// verifies the hash,
// parses it into the Data Model using a codec,
// and returns an IPLD Node.
//
// Where the data will be loaded from is determined by the configuration of the LinkSystem
// (namely, the StorageReadOpener callback, which can either be set directly,
// or configured via the SetReadStorage function).
//
// The in-memory form used for the returned Node is determined by the given NodePrototype parameter.
// A new builder and a new node will be allocated, via NodePrototype.NewBuilder.
// (If you'd like more control over memory allocation, you may wish to see the Fill function instead.)
//
// A schema may also be used to apply additional data validation during loading,
// by passing a schema.TypedNodePrototype as the NodePrototype argument.
//
// The LinkContext parameter may be used to pass contextual information down to the loading layer.
//
// Which hashing function is used to validate the loaded data is determined by LinkSystem.HasherChooser.
// Which codec is used to parse the loaded data into the Data Model is determined by LinkSystem.DecoderChooser.
//
// The LinkSystem.NodeReifier callback is also applied before returning the Node,
// and so Load may thereby return an ADL.
func (lsys *LinkSystem) Load(lnkCtx LinkContext, lnk datamodel.Link, np datamodel.NodePrototype) (datamodel.Node, error) {
nb := np.NewBuilder()
if err := lsys.Fill(lnkCtx, lnk, nb); err != nil {
@@ -33,6 +60,9 @@ func (lsys *LinkSystem) Load(lnkCtx LinkContext, lnk datamodel.Link, np datamode
return lsys.NodeReifier(lnkCtx, nd, lsys)
}
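
// A minimal usage sketch for Load follows (illustrative, not part of this commit).
// It is written from a caller's point of view, since this package cannot import
// basicnode itself (see the import-cycle note above); the function name `loadAny`
// and the `store` parameter are hypothetical, and the assumed imports are
// context, datamodel, linking, linking/cid (as cidlink), node/basicnode, and storage:
//
//	func loadAny(ctx context.Context, store storage.ReadableStorage, lnk datamodel.Link) (datamodel.Node, error) {
//		lsys := cidlink.DefaultLinkSystem() // comes with codec and multihash choosers preconfigured
//		lsys.SetReadStorage(store)          // configure where raw bytes are read from
//		return lsys.Load(
//			linking.LinkContext{Ctx: ctx}, // contextual info handed down to the loading layer
//			lnk,                           // the link identifying (and hash-verifying) the data
//			basicnode.Prototype.Any,       // the in-memory form for the returned Node
//		)
//	}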

// MustLoad is identical to Load, but panics in the case of errors.
//
// This function is meant for convenience of use in test and demo code, but should otherwise probably be avoided.
func (lsys *LinkSystem) MustLoad(lnkCtx LinkContext, lnk datamodel.Link, np datamodel.NodePrototype) datamodel.Node {
if n, err := lsys.Load(lnkCtx, lnk, np); err != nil {
panic(err)
@@ -41,6 +71,88 @@ func (lsys *LinkSystem) MustLoad(lnkCtx LinkContext, lnk datamodel.Link, np data
}
}

// LoadPlusRaw is similar to Load, but additionally retains and returns the byte slice of the raw data parsed.
//
// Be wary of using this with large data, since it will hold all data in memory at once.
// For more control over streaming, you may want to construct a LinkSystem where you wrap the storage opener callbacks,
// and thus can access the streams (and tee them, or whatever you need to do) as they're opened.
// This function is meant for convenience when data sizes are small enough that fitting them into memory at once is not a problem.
func (lsys *LinkSystem) LoadPlusRaw(lnkCtx LinkContext, lnk datamodel.Link, np datamodel.NodePrototype) (datamodel.Node, []byte, error) {
// Choose all the parts.
decoder, err := lsys.DecoderChooser(lnk)
if err != nil {
return nil, nil, ErrLinkingSetup{"could not choose a decoder", err}
}
// Use LoadRaw to get the data.
// If we're going to have everything in memory at once, we might as well do that first, and then give the codec and the hasher the whole thing at once.
block, err := lsys.LoadRaw(lnkCtx, lnk)
if err != nil {
return nil, block, err
}
// Create a NodeBuilder.
// Deploy the codec.
// Build the node.
nb := np.NewBuilder()
if err := decoder(nb, bytes.NewBuffer(block)); err != nil {
return nil, block, err
}
nd := nb.Build()
// Consider applying NodeReifier, if applicable.
if lsys.NodeReifier == nil {
return nd, block, nil
}
nd, err = lsys.NodeReifier(lnkCtx, nd, lsys)
return nd, block, err
}
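
// A usage sketch for LoadPlusRaw (illustrative, not part of this commit; `lsys`,
// `lnk`, `ctx`, and basicnode are assumed from the caller's context). It is handy
// when you want both the parsed view and the verbatim block, e.g. to forward the
// original bytes to a peer while also inspecting the content:
//
//	n, raw, err := lsys.LoadPlusRaw(linking.LinkContext{Ctx: ctx}, lnk, basicnode.Prototype.Any)
//	if err != nil {
//		return err
//	}
//	fmt.Printf("node kind: %v, block size: %d\n", n.Kind(), len(raw))
//
// And a sketch of the streaming alternative described above: wrapping the
// StorageReadOpener callback so every stream is teed as it is opened
// (`inner` and `sink` are hypothetical; `sink` is any io.Writer you control):
//
//	inner := lsys.StorageReadOpener
//	lsys.StorageReadOpener = func(lc linking.LinkContext, l datamodel.Link) (io.Reader, error) {
//		r, err := inner(lc, l)
//		if err != nil {
//			return nil, err
//		}
//		return io.TeeReader(r, sink), nil
//	}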

// LoadRaw looks up some data identified by a Link, brings that data into memory,
// verifies the hash, and returns it directly as a byte slice.
//
// LoadRaw does not return a data model view of the data,
// nor does it verify that a codec can parse the data at all!
// Use this function at your own risk; it does not provide the same guarantees as the Load or Fill functions do.
func (lsys *LinkSystem) LoadRaw(lnkCtx LinkContext, lnk datamodel.Link) ([]byte, error) {
if lnkCtx.Ctx == nil {
lnkCtx.Ctx = context.Background()
}
// Choose all the parts.
hasher, err := lsys.HasherChooser(lnk.Prototype())
if err != nil {
return nil, ErrLinkingSetup{"could not choose a hasher", err}
}
if lsys.StorageReadOpener == nil {
return nil, ErrLinkingSetup{"no storage configured for reading", io.ErrClosedPipe} // REVIEW: better cause?
}
// Open storage: get the data.
// FUTURE: this could probably use storage.ReadableStorage.Get instead of streaming and a buffer, if we refactored LinkSystem to carry that interface through.
reader, err := lsys.StorageReadOpener(lnkCtx, lnk)
if err != nil {
return nil, err
}
var buf bytes.Buffer
if _, err := io.Copy(&buf, reader); err != nil {
return nil, err
}
// Compute the hash.
// (Then do a bit of a jig to build a link out of it -- because that's what we do the actual hash equality check on.)
hasher.Write(buf.Bytes())
hash := hasher.Sum(nil)
lnk2 := lnk.Prototype().BuildLink(hash)
if lnk2 != lnk {
return nil, ErrHashMismatch{Actual: lnk2, Expected: lnk}
}
// No codec to deploy; this is the raw load function.
// So we're done.
return buf.Bytes(), nil
}
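
// A usage sketch for LoadRaw (illustrative, not part of this commit; `lsys`,
// `lnk`, and `ctx` assumed). The returned bytes are hash-verified but never
// decoded, which suits block-level work such as replication, where parsing
// would be wasted effort:
//
//	raw, err := lsys.LoadRaw(linking.LinkContext{Ctx: ctx}, lnk)
//	if err != nil {
//		return err
//	}
//	// `raw` is the exact serialized block whose hash matches `lnk`;
//	// no codec has inspected it, per the caveats above.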

// Fill is similar to Load, but allows more control over memory allocations.
// Instead of taking a NodePrototype parameter, Fill takes a NodeAssembler parameter:
// this allows you to use your own NodeBuilder (and reset it, etc., thus controlling allocations),
// or, to fill in some part of a larger structure.
//
// Note that Fill does not regard NodeReifier, even if one has been configured.
// (This is in contrast to Load, which does regard a NodeReifier if one is configured, and thus may return an ADL node).
func (lsys *LinkSystem) Fill(lnkCtx LinkContext, lnk datamodel.Link, na datamodel.NodeAssembler) error {
if lnkCtx.Ctx == nil {
lnkCtx.Ctx = context.Background()
@@ -57,38 +169,50 @@ func (lsys *LinkSystem) Fill(lnkCtx LinkContext, lnk datamodel.Link, na datamode
if lsys.StorageReadOpener == nil {
return ErrLinkingSetup{"no storage configured for reading", io.ErrClosedPipe} // REVIEW: better cause?
}
// Open storage; get a reader stream.
reader, err := lsys.StorageReadOpener(lnkCtx, lnk)
if err != nil {
return err
}
// TrustedStorage indicates the data coming out of this reader has already been hashed and verified earlier.
// As a result, we can skip rehashing it.
if lsys.TrustedStorage {
return decoder(na, reader)
}
// Tee the stream so that the hasher is fed as the unmarshal progresses through the stream.
tee := io.TeeReader(reader, hasher)
// The actual read is then dragged forward by the codec.
decodeErr := decoder(na, tee)
if decodeErr != nil {
// It is important to security to check the hash before returning any other observation about the content,
// so, if the decode process returns any error, we have several steps to take before potentially returning it.
// First, we try to copy any data remaining that wasn't already pulled through the TeeReader by the decoder,
// so that the hasher can reach the end of the stream.
// If _that_ errors, return the I/O level error.
// We hang onto decodeErr for a while: we can't return that until all the way after we check the hash equality.
_, err := io.Copy(hasher, reader)
if err != nil {
return err
}
}
// Compute the hash.
// (Then do a bit of a jig to build a link out of it -- because that's what we do the actual hash equality check on.)
hash := hasher.Sum(nil)
lnk2 := lnk.Prototype().BuildLink(hash)
if lnk2 != lnk {
return ErrHashMismatch{Actual: lnk2, Expected: lnk}
}
// If we got all the way through IO and through the hash check:
// now, finally, if we did get an error from the codec, we can admit to that.
if decodeErr != nil {
return decodeErr
}
return nil
}
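
// A usage sketch for Fill (illustrative, not part of this commit), reusing one
// NodeBuilder across many loads to keep allocations down; `lnks`, `use`, `ctx`,
// and basicnode are assumed from the caller's context, and NodeBuilder.Reset is
// what makes the builder reusable:
//
//	nb := basicnode.Prototype.Any.NewBuilder()
//	for _, lnk := range lnks {
//		if err := lsys.Fill(linking.LinkContext{Ctx: ctx}, lnk, nb); err != nil {
//			return err
//		}
//		use(nb.Build()) // consume the node before resetting
//		nb.Reset()      // builder is now ready for the next Fill
//	}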

// MustFill is identical to Fill, but panics in the case of errors.
//
// This function is meant for convenience of use in test and demo code, but should otherwise probably be avoided.
func (lsys *LinkSystem) MustFill(lnkCtx LinkContext, lnk datamodel.Link, na datamodel.NodeAssembler) {
if err := lsys.Fill(lnkCtx, lnk, na); err != nil {
panic(err)
}
}
