WIP: Compute an uncompressed digest for chunked layers #2155
base: main
Changes from all commits: 899c859, 5268287, 787ec97, 4c9e6d8
@@ -0,0 +1,19 @@
package composefs

import (
	"fmt"

	"github.com/opencontainers/go-digest"
)

// RegularFilePathForValidatedDigest returns the path used in the composefs backing store for a
// regular file with the provided content digest.
//
// The caller MUST ensure d is a valid digest (in particular, that it contains no path separators or .. entries).
func RegularFilePathForValidatedDigest(d digest.Digest) (string, error) {
	if algo := d.Algorithm(); algo != digest.SHA256 {
		return "", fmt.Errorf("unexpected digest algorithm %q", algo)
	}
	e := d.Encoded()
	return e[0:2] + "/" + e[2:], nil
}
@@ -0,0 +1,22 @@
package composefs

import (
	"testing"

	"github.com/opencontainers/go-digest"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestRegularFilePathForValidatedDigest(t *testing.T) {
	d, err := digest.Parse("sha256:0123456789abcdef1123456789abcdef2123456789abcdef3123456789abcdef")
	require.NoError(t, err)
	res, err := RegularFilePathForValidatedDigest(d)
	require.NoError(t, err)
	assert.Equal(t, "01/23456789abcdef1123456789abcdef2123456789abcdef3123456789abcdef", res)

	d, err = digest.Parse("sha512:0123456789abcdef1123456789abcdef2123456789abcdef3123456789abcdef0123456789abcdef1123456789abcdef2123456789abcdef3123456789abcdef")
	require.NoError(t, err)
	_, err = RegularFilePathForValidatedDigest(d)
	assert.Error(t, err)
}
@@ -2,6 +2,7 @@ package chunked

import (
	archivetar "archive/tar"
	"bytes"
	"context"
	"encoding/base64"
	"errors"

@@ -23,16 +24,20 @@ import (
	"github.com/containers/storage/pkg/archive"
	"github.com/containers/storage/pkg/chunked/compressor"
	"github.com/containers/storage/pkg/chunked/internal"
	"github.com/containers/storage/pkg/chunked/internal/composefs"
	"github.com/containers/storage/pkg/chunked/toc"
	"github.com/containers/storage/pkg/fsverity"
	"github.com/containers/storage/pkg/idtools"
	"github.com/containers/storage/pkg/system"
	securejoin "github.com/cyphar/filepath-securejoin"
	jsoniter "github.com/json-iterator/go"
	"github.com/klauspost/compress/zstd"
	"github.com/klauspost/pgzip"
	digest "github.com/opencontainers/go-digest"
	"github.com/sirupsen/logrus"
	"github.com/vbatts/tar-split/archive/tar"
	"github.com/vbatts/tar-split/tar/asm"
	tsStorage "github.com/vbatts/tar-split/tar/storage"
	"golang.org/x/sys/unix"
)
@@ -1111,10 +1116,13 @@ func (c *chunkedDiffer) findAndCopyFile(dirfd int, r *fileMetadata, copyOptions
 	return false, nil
 }

-func makeEntriesFlat(mergedEntries []fileMetadata) ([]fileMetadata, error) {
+// makeEntriesFlat collects regular-file entries from mergedEntries, and produces a new list
+// where each file content is only represented once, and uses composefs.RegularFilePathForValidatedDigest for its name.
+// if flatPathNameMap is not nil, this function writes to it a mapping from filepath.Clean(originalName) to the composefs name.
+func makeEntriesFlat(mergedEntries []fileMetadata, flatPathNameMap map[string]string) ([]fileMetadata, error) {
 	var new []fileMetadata

-	hashes := make(map[string]string)
+	knownFlatPaths := make(map[string]struct{})
 	for i := range mergedEntries {
 		if mergedEntries[i].Type != TypeReg {
 			continue

@@ -1124,16 +1132,22 @@ func makeEntriesFlat(mergedEntries []fileMetadata) ([]fileMetadata, error) {
 		}
 		digest, err := digest.Parse(mergedEntries[i].Digest)
 		if err != nil {
-			return nil, err
+			return nil, fmt.Errorf("invalid digest %q for %q: %w", mergedEntries[i].Digest, mergedEntries[i].Name, err)
 		}
+		path, err := composefs.RegularFilePathForValidatedDigest(digest)
+		if err != nil {
+			return nil, fmt.Errorf("determining physical file path for %q: %w", mergedEntries[i].Name, err)
+		}
+		if flatPathNameMap != nil {
+			flatPathNameMap[filepath.Clean(mergedEntries[i].Name)] = path
+		}
-		d := digest.Encoded()

-		if hashes[d] != "" {
+		if _, known := knownFlatPaths[path]; known {
 			continue
 		}
-		hashes[d] = d
+		knownFlatPaths[path] = struct{}{}

-		mergedEntries[i].Name = fmt.Sprintf("%s/%s", d[0:2], d[2:])
+		mergedEntries[i].Name = path
 		mergedEntries[i].skipSetAttrs = true

 		new = append(new, mergedEntries[i])
@@ -1415,16 +1429,20 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions, diff
 	if err != nil {
 		return output, &fs.PathError{Op: "open", Path: dest, Err: err}
 	}
-	defer unix.Close(dirfd)
+	dirFile := os.NewFile(uintptr(dirfd), dest)
+	defer dirFile.Close()

+	var flatPathNameMap map[string]string // = nil
 	if differOpts != nil && differOpts.Format == graphdriver.DifferOutputFormatFlat {
-		mergedEntries, err = makeEntriesFlat(mergedEntries)
+		flatPathNameMap = map[string]string{}
+		mergedEntries, err = makeEntriesFlat(mergedEntries, flatPathNameMap)
 		if err != nil {
 			return output, err
 		}
 		createdDirs := make(map[string]struct{})
 		for _, e := range mergedEntries {
-			d := e.Name[0:2]
+			// This hard-codes an assumption that RegularFilePathForValidatedDigest creates paths with exactly one directory component.
+			d := filepath.Dir(e.Name)
 			if _, found := createdDirs[d]; !found {
 				if err := unix.Mkdirat(dirfd, d, 0o755); err != nil {
 					return output, &fs.PathError{Op: "mkdirat", Path: d, Err: err}
@@ -1687,6 +1705,30 @@ func (c *chunkedDiffer) ApplyDiff(dest string, options *archive.TarOptions, diff
 		}
 	}

+	// To ensure that consumers of the layer who decompress and read the full tar stream,
+	// and consumers who consume the data via the TOC, both see exactly the same data and metadata,
+	// compute the UncompressedDigest.
+	// c/image will then ensure that this value matches the value in the image config’s RootFS.DiffID, i.e. the image must commit
+	// to one UncompressedDigest value for each layer, and that will avoid the ambiguity (in consumers who validate layers against DiffID).
+	//
+	// c/image also uses the UncompressedDigest as a layer ID, allowing it to use the traditional layer and image IDs.
+	//
+	// This is, sadly, quite costly: up to now we might have had to write, and digest, only the new/modified files.
+	// Here we need to read, and digest, the whole layer, even if almost all of it was already present locally.
+	//
+	// Also, layers without a tar-split (estargz layers and old zstd:chunked layers) can't produce an UncompressedDigest that
+	// matches the expected RootFS.DiffID; they will need to use a special TOC-based layer ID, and we need to somehow (??)
+	// ensure unambiguity - either by always falling back to full pulls, or by never falling back.
+	if output.UncompressedDigest == "" && output.TarSplit != nil {
+		metadata := tsStorage.NewJSONUnpacker(bytes.NewReader(output.TarSplit))
+		fg := newStagedFileGetter(dirFile, flatPathNameMap)
+		digester := digest.Canonical.Digester()
+		if err := asm.WriteOutputTarStream(fg, metadata, digester.Hash()); err != nil {
+			return output, fmt.Errorf("digesting staged uncompressed stream: %w", err)
+		}
+		output.UncompressedDigest = digester.Digest()
+	}
+
 	if totalChunksSize > 0 {
 		logrus.Debugf("Missing %d bytes out of %d (%.2f %%)", missingPartsSize, totalChunksSize, float32(missingPartsSize*100.0)/float32(totalChunksSize))
 	}

Review thread on the asm.WriteOutputTarStream call:

Comment: This is the expensive part where we're basically re-synthesizing the tarball from the unpacked files and the tar-split data just to get the checksum, right? I know this is just a PoC, but it seems very much worth a comment at the top of this conditional, something like: …

Reply: Yes.

Reply: Comment added, discussing more of the goals, costs and tradeoffs.
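For readers unfamiliar with tar-split, the technique the new comment describes (reassembling the uncompressed tar stream from the on-disk files plus the tar-split metadata, and digesting it) can be sketched stand-alone with tar-split's public API. This is not the PR's code path: it uses storage.NewPathFileGetter on a hypothetical unpacked directory instead of the stagedFileGetter added below, and reads raw, uncompressed tar-split JSON from a hypothetical file:

```go
// Hedged sketch: recompute a layer's uncompressed (DiffID-style) digest from an
// unpacked directory plus its tar-split metadata. All paths are hypothetical.
package main

import (
	"bytes"
	"fmt"
	"os"

	digest "github.com/opencontainers/go-digest"
	"github.com/vbatts/tar-split/tar/asm"
	tsStorage "github.com/vbatts/tar-split/tar/storage"
)

func uncompressedDigest(unpackedDir string, tarSplit []byte) (digest.Digest, error) {
	// The unpacker replays the original tar framing (headers, padding, entry order)...
	up := tsStorage.NewJSONUnpacker(bytes.NewReader(tarSplit))
	// ...while the file getter supplies the regular-file payloads from the unpacked tree.
	fg := tsStorage.NewPathFileGetter(unpackedDir)
	digester := digest.Canonical.Digester()
	if err := asm.WriteOutputTarStream(fg, up, digester.Hash()); err != nil {
		return "", fmt.Errorf("digesting reassembled tar stream: %w", err)
	}
	return digester.Digest(), nil
}

func main() {
	tarSplit, err := os.ReadFile("layer.tar-split.json") // hypothetical path
	if err != nil {
		panic(err)
	}
	d, err := uncompressedDigest("./unpacked-layer", tarSplit) // hypothetical directory
	if err != nil {
		panic(err)
	}
	// For a correctly staged layer this should equal the image config's RootFS.DiffID.
	fmt.Println("uncompressed digest:", d)
}
```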
@@ -1812,3 +1854,33 @@ func validateChunkChecksum(chunk *internal.FileMetadata, root, path string, offs

 	return digester.Digest() == digest
 }
+
+// newStagedFileGetter returns an object usable as storage.FileGetter for rootDir.
+// if flatPathNameMap is not nil, it must be used to map logical file names into the backing file paths.
+func newStagedFileGetter(rootDir *os.File, flatPathNameMap map[string]string) *stagedFileGetter {
+	return &stagedFileGetter{
+		rootDir:         rootDir,
+		flatPathNameMap: flatPathNameMap,
+	}
+}
+
+type stagedFileGetter struct {
+	rootDir         *os.File
+	flatPathNameMap map[string]string // nil, or a map from filepath.Clean()ed tar file names to expected on-filesystem names
+}
+
+func (fg *stagedFileGetter) Get(filename string) (io.ReadCloser, error) {
+	if fg.flatPathNameMap != nil {
+		path, ok := fg.flatPathNameMap[filepath.Clean(filename)]
+		if !ok {
+			return nil, fmt.Errorf("no path mapping exists for tar entry %q", filename)
+		}
+		filename = path
+	}
+	pathFD, err := securejoin.OpenatInRoot(fg.rootDir, filename)
+	if err != nil {
+		return nil, err
+	}
+	defer pathFD.Close()
+	return securejoin.Reopen(pathFD, unix.O_RDONLY)
+}

Review thread on the securejoin.OpenatInRoot call:

Comment: (A little surprising securejoin doesn't have a high level …

Reply: (We have an …
Review comment:
@giuseppe @cgwalters note this.
zstd:chunked layers have tar-split in the current format since b5413c2, i.e. c/storage 1.54 == Podman 5.1.0.
I’m not 100% sure what to do here. I’m leaning towards outright refusing to partially pull estargz and old chunked layers (falling back to full pull), that is “clearly correct” and always possible. (Users who insist on always using partial pulls, e.g. for composefs, can rebuild their images.) Alternatively, we could always do a partial pull and refuse a fallback, but then we would need to break pulling those layers on other graph drivers (or, eventually, implement partial pulls for those graph drivers) — and we still would have the signing ambiguity.
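A minimal sketch of the first option (every name below is hypothetical; this is not code from the PR): the differ would refuse up front to partially pull a layer that cannot later produce a matching UncompressedDigest, and the caller would interpret that error as a request to fall back to a full pull:

```go
// Hypothetical gate, not part of the PR: allow a partial pull only when the layer
// carries tar-split data, so that ApplyDiff can later compute an UncompressedDigest
// matching the image config's RootFS.DiffID; otherwise signal "do a full pull".
package main

import (
	"errors"
	"fmt"
)

var errNoTarSplitFallback = errors.New("layer has no tar-split data; falling back to a full pull")

func ensureUncompressedDigestPossible(hasTarSplit bool) error {
	if !hasTarSplit {
		// estargz layers and pre-c/storage-1.54 zstd:chunked layers would take this path.
		return errNoTarSplitFallback
	}
	return nil
}

func main() {
	for _, hasTarSplit := range []bool{true, false} {
		fmt.Println(hasTarSplit, "->", ensureUncompressedDigestPossible(hasTarSplit))
	}
}
```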