
Merge pull request #267 from ipld/raw-loads
linking: add LoadRaw and LoadPlusRaw functions to LinkSystem.
warpfork committed Oct 22, 2021
2 parents 911ba04 + f7b2b80 commit ebf675a
Showing 1 changed file with 129 additions and 5 deletions.
134 changes: 129 additions & 5 deletions linking/functions.go
@@ -1,6 +1,7 @@
package linking

import (
"bytes"
"context"
"io"

@@ -12,6 +13,7 @@ import (

// Variations:
// - Load vs Store vs ComputeLink
// - Load vs LoadPlusRaw
// - With or without LinkContext?
// - Brevity would be nice but I can't think of what to name the functions, so: everything takes LinkContext. Zero value is fine though.
// - [for load direction only]: Prototype (and return Node|error) or Assembler (and just return error)?
@@ -21,6 +23,31 @@ import (
// Can we get as far as a `QuickLoad(lnk Link) (Node, error)` function, which doesn't even ask you for a NodePrototype?
// No, not quite. (Alas.) If we tried to do so, and make it use `basicnode.Prototype`, we'd have import cycles; ded.

// Load looks up some data identified by a Link, and does everything necessary to turn it into usable data.
// In detail, that means it:
// brings that data into memory,
// verifies the hash,
// parses it into the Data Model using a codec,
// and returns an IPLD Node.
//
// Where the data will be loaded from is determined by the configuration of the LinkSystem
// (namely, the StorageReadOpener callback, which can either be set directly,
// or configured via the SetReadStorage function).
//
// The in-memory form used for the returned Node is determined by the given NodePrototype parameter.
// A new builder and a new node will be allocated, via NodePrototype.NewBuilder.
// (If you'd like more control over memory allocation, you may wish to see the Fill function instead.)
//
// A schema may also be used to apply additional data validation during loading,
// by passing a schema.TypedNodePrototype as the NodePrototype argument.
//
// The LinkContext parameter may be used to pass contextual information down to the loading layer.
//
// Which hashing function is used to validate the loaded data is determined by LinkSystem.HasherChooser.
// Which codec is used to parse the loaded data into the Data Model is determined by LinkSystem.DecoderChooser.
//
// The LinkSystem.NodeReifier callback is also applied before returning the Node,
// and so Load may thereby return an ADL.
func (lsys *LinkSystem) Load(lnkCtx LinkContext, lnk datamodel.Link, np datamodel.NodePrototype) (datamodel.Node, error) {
nb := np.NewBuilder()
if err := lsys.Fill(lnkCtx, lnk, nb); err != nil {
@@ -33,6 +60,9 @@ func (lsys *LinkSystem) Load(lnkCtx LinkContext, lnk datamodel.Link, np datamode
return lsys.NodeReifier(lnkCtx, nd, lsys)
}
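
// A minimal usage sketch for Load follows (illustrative, not part of this commit).
// It is written from a caller's point of view, since this package cannot import
// basicnode itself (see the import-cycle note above); the function name `loadAny`
// and the `store` parameter are hypothetical, and the assumed imports are
// context, datamodel, linking, linking/cid (as cidlink), node/basicnode, and storage:
//
//	func loadAny(ctx context.Context, store storage.ReadableStorage, lnk datamodel.Link) (datamodel.Node, error) {
//		lsys := cidlink.DefaultLinkSystem() // comes with codec and multihash choosers preconfigured
//		lsys.SetReadStorage(store)          // configure where raw bytes are read from
//		return lsys.Load(
//			linking.LinkContext{Ctx: ctx}, // contextual info handed down to the loading layer
//			lnk,                           // the link identifying (and hash-verifying) the data
//			basicnode.Prototype.Any,       // the in-memory form for the returned Node
//		)
//	}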

// MustLoad is identical to Load, but panics in the case of errors.
//
// This function is meant for convenience of use in test and demo code, but should otherwise probably be avoided.
func (lsys *LinkSystem) MustLoad(lnkCtx LinkContext, lnk datamodel.Link, np datamodel.NodePrototype) datamodel.Node {
if n, err := lsys.Load(lnkCtx, lnk, np); err != nil {
panic(err)
@@ -41,6 +71,88 @@ func (lsys *LinkSystem) MustLoad(lnkCtx LinkContext, lnk datamodel.Link, np data
}
}

// LoadPlusRaw is similar to Load, but additionally retains and returns the byte slice of the raw data parsed.
//
// Be wary of using this with large data, since it will hold all data in memory at once.
// For more control over streaming, you may want to construct a LinkSystem where you wrap the storage opener callbacks,
// and thus can access the streams (and tee them, or whatever you need to do) as they're opened.
// This function is meant for convenience when data sizes are small enough that fitting them into memory at once is not a problem.
func (lsys *LinkSystem) LoadPlusRaw(lnkCtx LinkContext, lnk datamodel.Link, np datamodel.NodePrototype) (datamodel.Node, []byte, error) {
// Choose all the parts.
decoder, err := lsys.DecoderChooser(lnk)
if err != nil {
return nil, nil, ErrLinkingSetup{"could not choose a decoder", err}
}
// Use LoadRaw to get the data.
// If we're going to have everything in memory at once, we might as well do that first, and then give the codec and the hasher the whole thing at once.
block, err := lsys.LoadRaw(lnkCtx, lnk)
if err != nil {
return nil, block, err
}
// Create a NodeBuilder.
// Deploy the codec.
// Build the node.
nb := np.NewBuilder()
if err := decoder(nb, bytes.NewBuffer(block)); err != nil {
return nil, block, err
}
nd := nb.Build()
// Consider applying NodeReifier, if applicable.
if lsys.NodeReifier == nil {
return nd, block, nil
}
nd, err = lsys.NodeReifier(lnkCtx, nd, lsys)
return nd, block, err
}
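
// A usage sketch for LoadPlusRaw (illustrative, not part of this commit; `lsys`,
// `lnk`, `ctx`, and basicnode are assumed from the caller's context). It is handy
// when you want both the parsed view and the verbatim block, e.g. to forward the
// original bytes to a peer while also inspecting the content:
//
//	n, raw, err := lsys.LoadPlusRaw(linking.LinkContext{Ctx: ctx}, lnk, basicnode.Prototype.Any)
//	if err != nil {
//		return err
//	}
//	fmt.Printf("node kind: %v, block size: %d\n", n.Kind(), len(raw))
//
// And a sketch of the streaming alternative described above: wrapping the
// StorageReadOpener callback so every stream is teed as it is opened
// (`inner` and `sink` are hypothetical; `sink` is any io.Writer you control):
//
//	inner := lsys.StorageReadOpener
//	lsys.StorageReadOpener = func(lc linking.LinkContext, l datamodel.Link) (io.Reader, error) {
//		r, err := inner(lc, l)
//		if err != nil {
//			return nil, err
//		}
//		return io.TeeReader(r, sink), nil
//	}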

// LoadRaw looks up some data identified by a Link, brings that data into memory,
// verifies the hash, and returns it directly as a byte slice.
//
// LoadRaw does not return a data model view of the data,
// nor does it verify that a codec can parse the data at all!
// Use this function at your own risk; it does not provide the same guarantees as the Load or Fill functions do.
func (lsys *LinkSystem) LoadRaw(lnkCtx LinkContext, lnk datamodel.Link) ([]byte, error) {
if lnkCtx.Ctx == nil {
lnkCtx.Ctx = context.Background()
}
// Choose all the parts.
hasher, err := lsys.HasherChooser(lnk.Prototype())
if err != nil {
return nil, ErrLinkingSetup{"could not choose a hasher", err}
}
if lsys.StorageReadOpener == nil {
return nil, ErrLinkingSetup{"no storage configured for reading", io.ErrClosedPipe} // REVIEW: better cause?
}
// Open storage: get the data.
// FUTURE: this could probably use storage.ReadableStorage.Get instead of streaming and a buffer, if we refactored LinkSystem to carry that interface through.
reader, err := lsys.StorageReadOpener(lnkCtx, lnk)
if err != nil {
return nil, err
}
var buf bytes.Buffer
if _, err := io.Copy(&buf, reader); err != nil {
return nil, err
}
// Compute the hash.
// (Then do a bit of a jig to build a link out of it -- because that's what we do the actual hash equality check on.)
hasher.Write(buf.Bytes())
hash := hasher.Sum(nil)
lnk2 := lnk.Prototype().BuildLink(hash)
if lnk2 != lnk {
return nil, ErrHashMismatch{Actual: lnk2, Expected: lnk}
}
// No codec to deploy; this is the raw load function.
// So we're done.
return buf.Bytes(), nil
}
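
// A usage sketch for LoadRaw (illustrative, not part of this commit; `lsys`,
// `lnk`, and `ctx` assumed). The returned bytes are hash-verified but never
// decoded, which suits block-level work such as replication, where parsing
// would be wasted effort:
//
//	raw, err := lsys.LoadRaw(linking.LinkContext{Ctx: ctx}, lnk)
//	if err != nil {
//		return err
//	}
//	// `raw` is the exact serialized block whose hash matches `lnk`;
//	// no codec has inspected it, per the caveats above.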

// Fill is similar to Load, but allows more control over memory allocations.
// Instead of taking a NodePrototype parameter, Fill takes a NodeAssembler parameter:
// this allows you to use your own NodeBuilder (and reset it, etc., thus controlling allocations),
// or, to fill in some part of a larger structure.
//
// Note that Fill does not regard NodeReifier, even if one has been configured.
// (This is in contrast to Load, which does regard a NodeReifier if one is configured, and thus may return an ADL node).
func (lsys *LinkSystem) Fill(lnkCtx LinkContext, lnk datamodel.Link, na datamodel.NodeAssembler) error {
if lnkCtx.Ctx == nil {
lnkCtx.Ctx = context.Background()
@@ -57,38 +169,50 @@ func (lsys *LinkSystem) Fill(lnkCtx LinkContext, lnk datamodel.Link, na datamode
if lsys.StorageReadOpener == nil {
return ErrLinkingSetup{"no storage configured for reading", io.ErrClosedPipe} // REVIEW: better cause?
}
// Open storage; get a reader stream.
reader, err := lsys.StorageReadOpener(lnkCtx, lnk)
if err != nil {
return err
}
// TrustedStorage indicates the data coming out of this reader has already been hashed and verified earlier.
// As a result, we can skip rehashing it.
if lsys.TrustedStorage {
return decoder(na, reader)
}
// Tee the stream so that the hasher is fed as the unmarshal progresses through the stream.
tee := io.TeeReader(reader, hasher)
// The actual read is then dragged forward by the codec.
decodeErr := decoder(na, tee)
if decodeErr != nil {
// It is important to security to check the hash before returning any other observation about the content,
// so, if the decode process returns any error, we have several steps to take before potentially returning it.
// First, we try to copy any data remaining that wasn't already pulled through the TeeReader by the decoder,
// so that the hasher can reach the end of the stream.
// If _that_ errors, return the I/O level error.
// We hang onto decodeErr for a while: we can't return that until all the way after we check the hash equality.
_, err := io.Copy(hasher, reader)
if err != nil {
return err
}
}
// Compute the hash.
// (Then do a bit of a jig to build a link out of it -- because that's what we do the actual hash equality check on.)
hash := hasher.Sum(nil)
lnk2 := lnk.Prototype().BuildLink(hash)
if lnk2 != lnk {
return ErrHashMismatch{Actual: lnk2, Expected: lnk}
}
// If we got all the way through IO and through the hash check:
// now, finally, if we did get an error from the codec, we can admit to that.
if decodeErr != nil {
return decodeErr
}
return nil
}
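
// A usage sketch for Fill (illustrative, not part of this commit), reusing one
// NodeBuilder across many loads to keep allocations down; `lnks`, `use`, `ctx`,
// and basicnode are assumed from the caller's context, and NodeBuilder.Reset is
// what makes the builder reusable:
//
//	nb := basicnode.Prototype.Any.NewBuilder()
//	for _, lnk := range lnks {
//		if err := lsys.Fill(linking.LinkContext{Ctx: ctx}, lnk, nb); err != nil {
//			return err
//		}
//		use(nb.Build()) // consume the node before resetting
//		nb.Reset()      // builder is now ready for the next Fill
//	}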

// MustFill is identical to Fill, but panics in the case of errors.
//
// This function is meant for convenience of use in test and demo code, but should otherwise probably be avoided.
func (lsys *LinkSystem) MustFill(lnkCtx LinkContext, lnk datamodel.Link, na datamodel.NodeAssembler) {
if err := lsys.Fill(lnkCtx, lnk, na); err != nil {
panic(err)
}
}
