layers.go

package storage

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"maps"
	"os"
	"path"
	"path/filepath"
	"reflect"
	"slices"
	"sort"
	"strings"
	"sync"
	"time"

	drivers "github.com/containers/storage/drivers"
	"github.com/containers/storage/pkg/archive"
	"github.com/containers/storage/pkg/idtools"
	"github.com/containers/storage/pkg/ioutils"
	"github.com/containers/storage/pkg/lockfile"
	"github.com/containers/storage/pkg/mount"
	"github.com/containers/storage/pkg/stringid"
	"github.com/containers/storage/pkg/system"
	"github.com/containers/storage/pkg/tarlog"
	"github.com/containers/storage/pkg/truncindex"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/klauspost/pgzip"
	digest "github.com/opencontainers/go-digest"
	"github.com/opencontainers/selinux/go-selinux"
	"github.com/sirupsen/logrus"
	"github.com/vbatts/tar-split/archive/tar"
	"github.com/vbatts/tar-split/tar/asm"
	"github.com/vbatts/tar-split/tar/storage"
)

const (
	tarSplitSuffix = ".tar-split.gz"
	incompleteFlag = "incomplete"
	// maxLayerStoreCleanupIterations is the number of times we try to clean up inconsistent layer store state
	// in readers (which, for implementation reasons, gives other writers the opportunity to create more inconsistent state)
	// until we just give up.
	maxLayerStoreCleanupIterations = 3
)

type layerLocations uint8

// The backing store is split in two json files, one (the volatile)
// that is written without fsync() meaning it isn't as robust to
// unclean shutdown
const (
	stableLayerLocation layerLocations = 1 << iota
	volatileLayerLocation

	numLayerLocationIndex = iota
)

func layerLocationFromIndex(index int) layerLocations {
	return 1 << index
}

// A Layer is a record of a copy-on-write layer that's stored by the lower
// level graph driver.
type Layer struct {
	// ID is either one which was specified at create-time, or a random
	// value which was generated by the library.
	ID string `json:"id"`

	// Names is an optional set of user-defined convenience values.  The
	// layer can be referred to by its ID or any of its names.  Names are
	// unique among layers.
	Names []string `json:"names,omitempty"`

	// Parent is the ID of a layer from which this layer inherits data.
	Parent string `json:"parent,omitempty"`

	// Metadata is data we keep for the convenience of the caller.  It is not
	// expected to be large, since it is kept in memory.
	Metadata string `json:"metadata,omitempty"`

	// MountLabel is an SELinux label which should be used when attempting to mount
	// the layer.
	MountLabel string `json:"mountlabel,omitempty"`

	// MountPoint is the path where the layer is mounted, or where it was most
	// recently mounted.
	//
	// WARNING: This field is a snapshot in time: (except for users inside c/storage that
	// hold the mount lock) the true value can change between subsequent
	// calls to c/storage API.
	//
	// Users that need to handle concurrent mount/unmount attempts should not access this
	// field at all, and should only use the path returned by .Mount() (and that’s only
	// assuming no other user will concurrently decide to unmount that mount point).
	MountPoint string `json:"-"`

	// MountCount is used as a reference count for the container's layer being
	// mounted at the mount point.
	//
	// WARNING: This field is a snapshot in time; (except for users inside c/storage that
	// hold the mount lock) the true value can change between subsequent
	// calls to c/storage API.
	//
	// In situations where concurrent mount/unmount attempts can happen, this field
	// should not be used for any decisions, maybe apart from heuristic user warnings.
	MountCount int `json:"-"`

	// Created is the datestamp for when this layer was created.  Older
	// versions of the library did not track this information, so callers
	// will likely want to use the IsZero() method to verify that a value
	// is set before using it.
	Created time.Time `json:"created,omitempty"`

	// CompressedDigest is the digest of the blob that was last passed to
	// ApplyDiff() or create(), as it was presented to us.
	CompressedDigest digest.Digest `json:"compressed-diff-digest,omitempty"`

	// CompressedSize is the length of the blob that was last passed to
	// ApplyDiff() or create(), as it was presented to us.  If
	// CompressedDigest is not set, this should be treated as if it were an
	// uninitialized value.
	CompressedSize int64 `json:"compressed-size,omitempty"`

	// UncompressedDigest is the digest of the blob that was last passed to
	// ApplyDiff() or create(), after we decompressed it.  Often referred to
	// as a DiffID.
	UncompressedDigest digest.Digest `json:"diff-digest,omitempty"`

	// TOCDigest represents the digest of the Table of Contents (TOC) of the blob.
	// This digest is utilized when the UncompressedDigest is not
	// validated during the partial image pull process, but the
	// TOC itself is validated.
	// It serves as an alternative reference under these specific conditions.
	TOCDigest digest.Digest `json:"toc-digest,omitempty"`

	// UncompressedSize is the length of the blob that was last passed to
	// ApplyDiff() or create(), after we decompressed it.
	//
	//   - If UncompressedDigest is set, this must be set to a valid value.
	//   - Otherwise, if TOCDigest is set, this is either valid or -1.
	//   - If neither of this digests is set, this should be treated as if it were
	//     an uninitialized value.
	UncompressedSize int64 `json:"diff-size,omitempty"`

	// CompressionType is the type of compression which we detected on the blob
	// that was last passed to ApplyDiff() or create().
	CompressionType archive.Compression `json:"compression,omitempty"`

	// UIDs and GIDs are lists of UIDs and GIDs used in the layer.  This
	// field is only populated (i.e., will only contain one or more
	// entries) if the layer was created using ApplyDiff() or create().
	UIDs []uint32 `json:"uidset,omitempty"`
	GIDs []uint32 `json:"gidset,omitempty"`

	// Flags is arbitrary data about the layer.
	Flags map[string]interface{} `json:"flags,omitempty"`

	// UIDMap and GIDMap are used for setting up a layer's contents
	// for use inside of a user namespace where UID mapping is being used.
	UIDMap []idtools.IDMap `json:"uidmap,omitempty"`
	GIDMap []idtools.IDMap `json:"gidmap,omitempty"`

	// ReadOnly is true if this layer resides in a read-only layer store.
	ReadOnly bool `json:"-"`

	// volatileStore is true if the container is from the volatile json file
	volatileStore bool `json:"-"`

	// BigDataNames is a list of names of data items that we keep for the
	// convenience of the caller.  They can be large, and are only in
	// memory when being read from or written to disk.
	BigDataNames []string `json:"big-data-names,omitempty"`
}

type layerMountPoint struct {
	ID         string `json:"id"`
	MountPoint string `json:"path"`
	MountCount int    `json:"count"`
}

// DiffOptions override the default behavior of Diff() methods.
type DiffOptions struct {
	// Compression, if set overrides the default compressor when generating a diff.
	Compression *archive.Compression
}

// stagedLayerOptions are the options passed to .create to populate a staged
// layer
type stagedLayerOptions struct {
	DiffOutput  *drivers.DriverWithDifferOutput
	DiffOptions *drivers.ApplyDiffWithDifferOpts
}

// roLayerStore wraps a graph driver, adding the ability to refer to layers by
// name, and keeping track of parent-child relationships, along with a list of
// all known layers.
type roLayerStore interface {
	roMetadataStore
	roLayerBigDataStore

	// startReading makes sure the store is fresh, and locks it for reading.
	// If this succeeds, the caller MUST call stopReading().
	startReading() error

	// stopReading releases locks obtained by startReading.
	stopReading()

	// Exists checks if a layer with the specified name or ID is known.
	Exists(id string) bool

	// Get retrieves information about a layer given an ID or name.
	Get(id string) (*Layer, error)

	// Status returns an slice of key-value pairs, suitable for human consumption,
	// relaying whatever status information the underlying driver can share.
	Status() ([][2]string, error)

	// Changes returns a slice of Change structures, which contain a pathname
	// (Path) and a description of what sort of change (Kind) was made by the
	// layer (either ChangeModify, ChangeAdd, or ChangeDelete), relative to a
	// specified layer.  By default, the layer's parent is used as a reference.
	Changes(from, to string) ([]archive.Change, error)

	// Diff produces a tarstream which can be applied to a layer with the contents
	// of the first layer to produce a layer with the contents of the second layer.
	// By default, the parent of the second layer is used as the first
	// layer, so it need not be specified.  Options can be used to override
	// default behavior, but are also not required.
	Diff(from, to string, options *DiffOptions) (io.ReadCloser, error)

	// DiffSize produces an estimate of the length of the tarstream which would be
	// produced by Diff.
	DiffSize(from, to string) (int64, error)

	// Size produces a cached value for the uncompressed size of the layer,
	// if one is known, or -1 if it is not known.  If the layer can not be
	// found, it returns an error.
	Size(name string) (int64, error)

	// LayersByCompressedDigest returns a slice of the layers with the
	// specified compressed digest value recorded for them.
	LayersByCompressedDigest(d digest.Digest) ([]Layer, error)

	// LayersByUncompressedDigest returns a slice of the layers with the
	// specified uncompressed digest value recorded for them.
	LayersByUncompressedDigest(d digest.Digest) ([]Layer, error)

	// LayersByTOCDigest returns a slice of the layers with the
	// specified uncompressed digest value recorded for them.
	LayersByTOCDigest(d digest.Digest) ([]Layer, error)

	// Layers returns a slice of the known layers.
	Layers() ([]Layer, error)
}

// rwLayerStore wraps a graph driver, adding the ability to refer to layers by
// name, and keeping track of parent-child relationships, along with a list of
// all known layers.
type rwLayerStore interface {
	roLayerStore
	rwMetadataStore
	flaggableStore
	rwLayerBigDataStore

	// startWriting makes sure the store is fresh, and locks it for writing.
	// If this succeeds, the caller MUST call stopWriting().
	startWriting() error

	// stopWriting releases locks obtained by startWriting.
	stopWriting()

	// create creates a new layer, optionally giving it a specified ID rather than
	// a randomly-generated one, either inheriting data from another specified
	// layer or the empty base layer.  The new layer can optionally be given names
	// and have an SELinux label specified for use when mounting it.  Some
	// underlying drivers can accept a "size" option.  At this time, most
	// underlying drivers do not themselves distinguish between writeable
	// and read-only layers.  Returns the new layer structure and the size of the
	// diff which was applied to its parent to initialize its contents.
	create(id string, parent *Layer, names []string, mountLabel string, options map[string]string, moreOptions *LayerOptions, writeable bool, diff io.Reader, slo *stagedLayerOptions) (*Layer, int64, error)

	// updateNames modifies names associated with a layer based on (op, names).
	updateNames(id string, names []string, op updateNameOperation) error

	// Delete deletes a layer with the specified name or ID.
	Delete(id string) error

	// Wipe deletes all layers.
	Wipe() error

	// Mount mounts a layer for use.  If the specified layer is the parent of other
	// layers, it should not be written to.  An SELinux label to be applied to the
	// mount can be specified to override the one configured for the layer.
	// The mappings used by the container can be specified.
	Mount(id string, options drivers.MountOpts) (string, error)

	// unmount unmounts a layer when it is no longer in use.
	// If conditional is set, it will fail with ErrLayerNotMounted if the layer is not mounted (without conditional, the caller is
	// making a promise that the layer is actually mounted).
	// If force is set, it will physically try to unmount it even if it is mounted multiple times, or even if (!conditional and)
	// there are no records of it being mounted in the first place.
	// It returns whether the layer was still mounted at the time this function returned.
	// WARNING: The return value may already be obsolete by the time it is available
	// to the caller, so it can be used for heuristic sanity checks at best. It should almost always be ignored.
	unmount(id string, force bool, conditional bool) (bool, error)

	// Mounted returns number of times the layer has been mounted.
	Mounted(id string) (int, error)

	// ParentOwners returns the UIDs and GIDs of parents of the layer's mountpoint
	// for which the layer's UID and GID maps don't contain corresponding entries.
	ParentOwners(id string) (uids, gids []int, err error)

	// ApplyDiff reads a tarstream which was created by a previous call to Diff and
	// applies its changes to a specified layer.
	ApplyDiff(to string, diff io.Reader) (int64, error)

	// applyDiffWithDifferNoLock applies the changes through the differ callback function.
	applyDiffWithDifferNoLock(options *drivers.ApplyDiffWithDifferOpts, differ drivers.Differ) (*drivers.DriverWithDifferOutput, error)

	// CleanupStagingDirectory cleanups the staging directory.  It can be used to cleanup the staging directory on errors
	CleanupStagingDirectory(stagingDirectory string) error

	// applyDiffFromStagingDirectory uses diffOutput.Target to create the diff.
	applyDiffFromStagingDirectory(id string, diffOutput *drivers.DriverWithDifferOutput, options *drivers.ApplyDiffWithDifferOpts) error

	// DifferTarget gets the location where files are stored for the layer.
	DifferTarget(id string) (string, error)

	// PutAdditionalLayer creates a layer using the diff contained in the additional layer
	// store.
	// This API is experimental and can be changed without bumping the major version number.
	PutAdditionalLayer(id string, parentLayer *Layer, names []string, aLayer drivers.AdditionalLayer) (layer *Layer, err error)

	// Clean up unreferenced layers
	GarbageCollect() error
}

type multipleLockFile struct {
	lockfiles []*lockfile.LockFile
}

func (l multipleLockFile) Lock() {
	for _, lock := range l.lockfiles {
		lock.Lock()
	}
}

func (l multipleLockFile) RLock() {
	for _, lock := range l.lockfiles {
		lock.RLock()
	}
}

func (l multipleLockFile) Unlock() {
	for _, lock := range l.lockfiles {
		lock.Unlock()
	}
}

func (l multipleLockFile) ModifiedSince(lastWrite lockfile.LastWrite) (lockfile.LastWrite, bool, error) {
	// Look up only the first lockfile, since this is the value returned by RecordWrite().
	return l.lockfiles[0].ModifiedSince(lastWrite)
}

func (l multipleLockFile) AssertLockedForWriting() {
	for _, lock := range l.lockfiles {
		lock.AssertLockedForWriting()
	}
}

func (l multipleLockFile) GetLastWrite() (lockfile.LastWrite, error) {
	return l.lockfiles[0].GetLastWrite()
}

func (l multipleLockFile) RecordWrite() (lockfile.LastWrite, error) {
	var lastWrite *lockfile.LastWrite
	for _, lock := range l.lockfiles {
		lw, err := lock.RecordWrite()
		if err != nil {
			return lw, err
		}
		// Return the first value we get so we know that
		// all the locks have a write time >= to this one.
		if lastWrite == nil {
			lastWrite = &lw
		}
	}
	return *lastWrite, nil
}

func (l multipleLockFile) IsReadWrite() bool {
	return l.lockfiles[0].IsReadWrite()
}

func newMultipleLockFile(l ...*lockfile.LockFile) *multipleLockFile {
	return &multipleLockFile{lockfiles: l}
}

type layerStore struct {
	// The following fields are only set when constructing layerStore, and must never be modified afterwards.
	// They are safe to access without any other locking.
	lockfile       *multipleLockFile  // lockfile.IsReadWrite can be used to distinguish between read-write and read-only layer stores.
	mountsLockfile *lockfile.LockFile // Can _only_ be obtained with inProcessLock held.
	rundir         string
	jsonPath       [numLayerLocationIndex]string
	layerdir       string

	inProcessLock sync.RWMutex // Can _only_ be obtained with lockfile held.
	// The following fields can only be read/written with read/write ownership of inProcessLock, respectively.
	// Almost all users should use startReading() or startWriting().
	lastWrite           lockfile.LastWrite
	mountsLastWrite     lockfile.LastWrite // Only valid if lockfile.IsReadWrite()
	layers              []*Layer
	idindex             *truncindex.TruncIndex
	byid                map[string]*Layer
	byname              map[string]*Layer
	bymount             map[string]*Layer
	bycompressedsum     map[digest.Digest][]string
	byuncompressedsum   map[digest.Digest][]string
	bytocsum            map[digest.Digest][]string
	layerspathsModified [numLayerLocationIndex]time.Time

	// FIXME: This field is only set when constructing layerStore, but locking rules of the driver
	// interface itself are not documented here.
	driver drivers.Driver
}

// The caller must hold r.inProcessLock for reading.
func layerLocation(l *Layer) layerLocations {
	if l.volatileStore {
		return volatileLayerLocation
	}
	return stableLayerLocation
}

func copyLayer(l *Layer) *Layer {
	return &Layer{
		ID:                 l.ID,
		Names:              copySlicePreferringNil(l.Names),
		Parent:             l.Parent,
		Metadata:           l.Metadata,
		MountLabel:         l.MountLabel,
		MountPoint:         l.MountPoint,
		MountCount:         l.MountCount,
		Created:            l.Created,
		CompressedDigest:   l.CompressedDigest,
		CompressedSize:     l.CompressedSize,
		UncompressedDigest: l.UncompressedDigest,
		UncompressedSize:   l.UncompressedSize,
		TOCDigest:          l.TOCDigest,
		CompressionType:    l.CompressionType,
		ReadOnly:           l.ReadOnly,
		volatileStore:      l.volatileStore,
		BigDataNames:       copySlicePreferringNil(l.BigDataNames),
		Flags:              copyMapPreferringNil(l.Flags),
		UIDMap:             copySlicePreferringNil(l.UIDMap),
		GIDMap:             copySlicePreferringNil(l.GIDMap),
		UIDs:               copySlicePreferringNil(l.UIDs),
		GIDs:               copySlicePreferringNil(l.GIDs),
	}
}

// startWritingWithReload makes sure the store is fresh if canReload, and locks it for writing.
// If this succeeds, the caller MUST call stopWriting().
//
// This is an internal implementation detail of layerStore construction, every other caller
// should use startWriting() instead.
func (r *layerStore) startWritingWithReload(canReload bool) error {
	r.lockfile.Lock()
	r.inProcessLock.Lock()
	succeeded := false
	defer func() {
		if !succeeded {
			r.inProcessLock.Unlock()
			r.lockfile.Unlock()
		}
	}()

	if canReload {
		if _, err := r.reloadIfChanged(true); err != nil {
			return err
		}
	}

	succeeded = true
	return nil
}

// startWriting makes sure the store is fresh, and locks it for writing.
// If this succeeds, the caller MUST call stopWriting().
func (r *layerStore) startWriting() error {
	return r.startWritingWithReload(true)
}

// stopWriting releases locks obtained by startWriting.
func (r *layerStore) stopWriting() {
	r.inProcessLock.Unlock()
	r.lockfile.Unlock()
}

// startReadingWithReload makes sure the store is fresh if canReload, and locks it for reading.
// If this succeeds, the caller MUST call stopReading().
//
// This is an internal implementation detail of layerStore construction, every other caller
// should use startReading() instead.
func (r *layerStore) startReadingWithReload(canReload bool) error {
	// inProcessLocked calls the nested function with r.inProcessLock held for writing.
	inProcessLocked := func(fn func() error) error {
		r.inProcessLock.Lock()
		defer r.inProcessLock.Unlock()
		return fn()
	}

	r.lockfile.RLock()
	unlockFn := r.lockfile.Unlock // A function to call to clean up, or nil
	defer func() {
		if unlockFn != nil {
			unlockFn()
		}
	}()
	r.inProcessLock.RLock()
	unlockFn = r.stopReading

	if canReload {
		// If we are lucky, we can just hold the read locks, check that we are fresh, and continue.
		modified, err := r.modified()
		if err != nil {
			return err
		}
		if modified {
			// We are unlucky, and need to reload.
			// NOTE: Multiple goroutines can get to this place approximately simultaneously.
			r.inProcessLock.RUnlock()
			unlockFn = r.lockfile.Unlock

			cleanupsDone := 0
			for {
				// First try reloading with r.lockfile held for reading.
				// r.inProcessLock will serialize all goroutines that got here;
				// each will re-check on-disk state vs. r.lastWrite, and the first one will actually reload the data.
				var tryLockedForWriting bool
				err := inProcessLocked(func() error {
					var err error
					tryLockedForWriting, err = r.reloadIfChanged(false)
					return err
				})
				if err == nil {
					break
				}
				if !tryLockedForWriting {
					return err
				}
				if cleanupsDone >= maxLayerStoreCleanupIterations {
					return fmt.Errorf("(even after %d cleanup attempts:) %w", cleanupsDone, err)
				}
				// Not good enough, we need r.lockfile held for writing. So, let’s do that.
				unlockFn()
				unlockFn = nil

				r.lockfile.Lock()
				unlockFn = r.lockfile.Unlock
				if err := inProcessLocked(func() error {
					_, err := r.reloadIfChanged(true)
					return err
				}); err != nil {
					return err
				}
				unlockFn()
				unlockFn = nil

				r.lockfile.RLock()
				unlockFn = r.lockfile.Unlock
				// We need to check for a reload again because the on-disk state could have been modified
				// after we released the lock.
				cleanupsDone++
			}

			// NOTE that we hold neither a read nor write inProcessLock at this point. That’s fine in ordinary operation, because
			// the on-filesystem r.lockfile should protect us against (cooperating) writers, and any use of r.inProcessLock
			// protects us against in-process writers modifying data.
			// In presence of non-cooperating writers, we just ensure that 1) the in-memory data is not clearly out-of-date
			// and 2) access to the in-memory data is not racy;
			// but we can’t protect against those out-of-process writers modifying _files_ while we are assuming they are in a consistent state.

			r.inProcessLock.RLock()
		}
	}

	unlockFn = nil
	return nil
}

// startReading makes sure the store is fresh, and locks it for reading.
// If this succeeds, the caller MUST call stopReading().
func (r *layerStore) startReading() error {
	return r.startReadingWithReload(true)
}

// stopReading releases locks obtained by startReading.
func (r *layerStore) stopReading() {
	r.inProcessLock.RUnlock()
	r.lockfile.Unlock()
}

// modified returns true if the on-disk state (of layers or mounts) has changed (ie if reloadIcHanged may need to modify the store)
//
// Note that unlike containerStore.modified and imageStore.modified, this function is not directly used in layerStore.reloadIfChanged();
// it exists only to help the reader ensure it has fresh enough state.
//
// The caller must hold r.lockfile for reading _or_ writing.
// The caller must hold r.inProcessLock for reading or writing.
func (r *layerStore) modified() (bool, error) {
	_, m, err := r.layersModified()
	if err != nil {
		return false, err
	}
	if m {
		return true, nil
	}
	if r.lockfile.IsReadWrite() {
		// This means we get, release, and re-obtain, r.mountsLockfile if we actually need to do any kind of reload.
		// That’s a bit expensive, but hopefully most callers will be read-only and see no changes.
		// We can’t eliminate these mountsLockfile accesses given the current assumption that Layer objects have _some_ not-very-obsolete
		// mount data. Maybe we can segregate the mount-dependent and mount-independent operations better...
		r.mountsLockfile.RLock()
		defer r.mountsLockfile.Unlock()
		_, m, err := r.mountsModified()
		if err != nil {
			return false, err
		}
		if m {
			return true, nil
		}
	}
	return false, nil
}

// layersModified() checks if the most recent writer to r.jsonPath[] was a party other than the
// last recorded writer. If so, it returns a lockfile.LastWrite value to record on a successful
// reload.
// It should only be called with the lock held.
// The caller must hold r.inProcessLock for reading.
func (r *layerStore) layersModified() (lockfile.LastWrite, bool, error) {
	lastWrite, modified, err := r.lockfile.ModifiedSince(r.lastWrite)
	if err != nil {
		return lockfile.LastWrite{}, modified, err
	}
	if modified {
		return lastWrite, true, nil
	}

	// If the layers.json file or container-layers.json has been
	// modified manually, then we have to reload the storage in
	// any case.
	for locationIndex := 0; locationIndex < numLayerLocationIndex; locationIndex++ {
		info, err := os.Stat(r.jsonPath[locationIndex])
		if err != nil && !os.IsNotExist(err) {
			return lockfile.LastWrite{}, false, fmt.Errorf("stat layers file: %w", err)
		}
		if info != nil && info.ModTime() != r.layerspathsModified[locationIndex] {
			// In this case the LastWrite value is equal to r.lastWrite; writing it back doesn’t hurt.
			return lastWrite, true, nil
		}
	}

	return lockfile.LastWrite{}, false, nil
}

// reloadIfChanged reloads the contents of the store from disk if it is changed.
//
// The caller must hold r.lockfile for reading _or_ writing; lockedForWriting is true
// if it is held for writing.
//
// The caller must hold r.inProcessLock for WRITING.
//
// If !lockedForWriting and this function fails, the return value indicates whether
// reloadIfChanged() with lockedForWriting could succeed.
func (r *layerStore) reloadIfChanged(lockedForWriting bool) (bool, error) {
	lastWrite, layersModified, err := r.layersModified()
	if err != nil {
		return false, err
	}
	if layersModified {
		// r.load also reloads mounts data; so, on this path, we don’t need to call reloadMountsIfChanged.
		if tryLockedForWriting, err := r.load(lockedForWriting); err != nil {
			return tryLockedForWriting, err // r.lastWrite is unchanged, so we will load the next time again.
		}
		r.lastWrite = lastWrite
		return false, nil
	}
	if r.lockfile.IsReadWrite() {
		r.mountsLockfile.RLock()
		defer r.mountsLockfile.Unlock()
		if err := r.reloadMountsIfChanged(); err != nil {
			return false, err
		}
	}
	return false, nil
}

// mountsModified returns true if the on-disk mount state has changed (i.e. if reloadMountsIfChanged may need to modify the store),
// and a lockfile.LastWrite value for that update.
//
// The caller must hold r.mountsLockfile for reading _or_ writing.
// The caller must hold r.inProcessLock for reading or writing.
func (r *layerStore) mountsModified() (lockfile.LastWrite, bool, error) {
	return r.mountsLockfile.ModifiedSince(r.mountsLastWrite)
}

// reloadMountsIfChanged reloads the contents of mountsPath from disk if it is changed.
//
// The caller must hold r.mountsLockFile for reading or writing.
func (r *layerStore) reloadMountsIfChanged() error {
	lastWrite, modified, err := r.mountsModified()
	if err != nil {
		return err
	}
	if modified {
		if err = r.loadMounts(); err != nil {
			return err
		}
		r.mountsLastWrite = lastWrite
	}
	return nil
}

// Requires startReading or startWriting.
func (r *layerStore) Layers() ([]Layer, error) {
	layers := make([]Layer, len(r.layers))
	for i := range r.layers {
		layers[i] = *copyLayer(r.layers[i])
	}
	return layers, nil
}

// Requires startWriting.
func (r *layerStore) GarbageCollect() error {
	layers, err := r.driver.ListLayers()
	if err != nil {
		if errors.Is(err, drivers.ErrNotSupported) {
			return nil
		}
		return err
	}

	for _, id := range layers {
		// Is the id still referenced
		if r.byid[id] != nil {
			continue
		}

		// Remove layer and any related data of unreferenced id
		if err := r.driver.Remove(id); err != nil {
			logrus.Debugf("removing driver layer %q", id)
			return err
		}

		logrus.Debugf("removing %q", r.tspath(id))
		os.Remove(r.tspath(id))
		logrus.Debugf("removing %q", r.datadir(id))
		os.RemoveAll(r.datadir(id))
	}
	return nil
}

func (r *layerStore) mountspath() string {
	return filepath.Join(r.rundir, "mountpoints.json")
}

// load reloads the contents of the store from disk.
//
// Most callers should call reloadIfChanged() instead, to avoid overhead and to correctly
// manage r.lastWrite.
//
// As a side effect, this sets r.mountsLastWrite.
//
// The caller must hold r.lockfile for reading _or_ writing; lockedForWriting is true
// if it is held for writing.
// The caller must hold r.inProcessLock for WRITING.
//
// If !lockedForWriting and this function fails, the return value indicates whether
// retrying with lockedForWriting could succeed.
func (r *layerStore) load(lockedForWriting bool) (bool, error) {
	var modifiedLocations layerLocations

	layers := []*Layer{}
	ids := make(map[string]*Layer)

	for locationIndex := 0; locationIndex < numLayerLocationIndex; locationIndex++ {
		location := layerLocationFromIndex(locationIndex)
		rpath := r.jsonPath[locationIndex]
		info, err := os.Stat(rpath)
		if err != nil {
			if !os.IsNotExist(err) {
				return false, err
			}
		} else {
			r.layerspathsModified[locationIndex] = info.ModTime()
		}
		data, err := os.ReadFile(rpath)
		if err != nil && !os.IsNotExist(err) {
			return false, err
		}

		locationLayers := []*Layer{}
		if len(data) != 0 {
			if err := json.Unmarshal(data, &locationLayers); err != nil {
				return false, fmt.Errorf("loading %q: %w", rpath, err)
			}
		}

		for _, layer := range locationLayers {
			// There should be no duplicated ids between json files, but lets check to be sure
			if ids[layer.ID] != nil {
				continue // skip invalid duplicated layer
			}
			// Remember where the layer came from
			if location == volatileLayerLocation {
				layer.volatileStore = true
			}
			layers = append(layers, layer)
			ids[layer.ID] = layer
		}
	}

	idlist := make([]string, 0, len(layers))
	names := make(map[string]*Layer)
	compressedsums := make(map[digest.Digest][]string)
	uncompressedsums := make(map[digest.Digest][]string)
	tocsums := make(map[digest.Digest][]string)
	var errorToResolveBySaving error // == nil; if there are multiple errors, this is one of them.
	if r.lockfile.IsReadWrite() {
		selinux.ClearLabels()
	}
	for n, layer := range layers {
		idlist = append(idlist, layer.ID)
		for _, name := range layer.Names {
			if conflict, ok := names[name]; ok {
				r.removeName(conflict, name)
				errorToResolveBySaving = ErrDuplicateLayerNames
				modifiedLocations |= layerLocation(conflict)
			}
			names[name] = layers[n]
		}
		if layer.CompressedDigest != "" {
			compressedsums[layer.CompressedDigest] = append(compressedsums[layer.CompressedDigest], layer.ID)
		}
		if layer.UncompressedDigest != "" {
			uncompressedsums[layer.UncompressedDigest] = append(uncompressedsums[layer.UncompressedDigest], layer.ID)
		}
		if layer.TOCDigest != "" {
			tocsums[layer.TOCDigest] = append(tocsums[layer.TOCDigest], layer.ID)
		}
		if layer.MountLabel != "" {
			selinux.ReserveLabel(layer.MountLabel)
		}
		layer.ReadOnly = !r.lockfile.IsReadWrite()
		// The r.lockfile.IsReadWrite() condition maintains past practice:
		// Incomplete layers in a read-only store are not treated as a reason to refuse to use other layers from that store
		// (OTOH creating child layers on top would probably lead to problems?).
		// We do remove incomplete layers in read-write stores so that we don’t build on top of them.
		if layerHasIncompleteFlag(layer) && r.lockfile.IsReadWrite() {
			errorToResolveBySaving = errors.New("an incomplete layer exists and can't be cleaned up")
		}
	}

	if errorToResolveBySaving != nil {
		if !r.lockfile.IsReadWrite() {
			return false, errorToResolveBySaving
		}
		if !lockedForWriting {
			return true, errorToResolveBySaving
		}
	}
	r.layers = layers
	r.idindex = truncindex.NewTruncIndex(idlist) // Invalid values in idlist are ignored: they are not a reason to refuse processing the whole store.
	r.byid = ids
	r.byname = names
	r.bycompressedsum = compressedsums
	r.byuncompressedsum = uncompressedsums
	r.bytocsum = tocsums

	// Load and merge information about which layers are mounted, and where.
	if r.lockfile.IsReadWrite() {
		r.mountsLockfile.RLock()
		defer r.mountsLockfile.Unlock()
		// We need to reload mounts unconditionally, becuause by creating r.layers from scratch, we have discarded the previous
		// information, if any. So, obtain a fresh mountsLastWrite value so that we don’t unnecessarily reload the data
		// afterwards.
		mountsLastWrite, err := r.mountsLockfile.GetLastWrite()
		if err != nil {
			return false, err
		}
		if err := r.loadMounts(); err != nil {
			return false, err
		}
		r.mountsLastWrite = mountsLastWrite
		// NOTE: We will release mountsLockfile when this function returns, so unlike most of the layer data, the
		// r.layers[].MountPoint, r.layers[].MountCount, and r.bymount values might not reflect
		// true on-filesystem state already by the time this function returns.
		// Code that needs the state to be accurate must lock r.mountsLockfile again,
		// and possibly loadMounts() again.
	}

	if errorToResolveBySaving != nil {
		if !r.lockfile.IsReadWrite() {
			return false, fmt.Errorf("internal error: layerStore.load has shouldSave but !r.lockfile.IsReadWrite")
		}
		// Last step: try to remove anything that a previous
		// user of this storage area marked for deletion but didn't manage to
		// actually delete.
		var incompleteDeletionErrors error // = nil
		for _, layer := range r.layers {
			if layer.Flags == nil {
				layer.Flags = make(map[string]interface{})
			}
			if layerHasIncompleteFlag(layer) {
				logrus.Warnf("Found incomplete layer %#v, deleting it", layer.ID)
				err := r.deleteInternal(layer.ID)
				if err != nil {
					// Don't return the error immediately, because deleteInternal does not saveLayers();
					// Even if deleting one incomplete layer fails, call saveLayers() so that other possible successfully
					// deleted incomplete layers have their metadata correctly removed.
					incompleteDeletionErrors = multierror.Append(incompleteDeletionErrors,
						fmt.Errorf("deleting layer %#v: %w", layer.ID, err))
				}
				modifiedLocations |= layerLocation(layer)
			}
		}
		if err := r.saveLayers(modifiedLocations); err != nil {
			return false, err
		}
		if incompleteDeletionErrors != nil {
			return false, incompleteDeletionErrors
		}
	}
	return false, nil
}

// The caller must hold r.mountsLockfile for reading or writing.
// The caller must hold r.inProcessLock for WRITING.
func (r *layerStore) loadMounts() error {
	mounts := make(map[string]*Layer)
	mpath := r.mountspath()
	data, err := os.ReadFile(mpath)
	if err != nil && !os.IsNotExist(err) {
		return err
	}
	layerMounts := []layerMountPoint{}
	if len(data) != 0 {
		if err := json.Unmarshal(data, &layerMounts); err != nil {
			return err
		}
	}
	// Clear all of our mount information.  If another process
	// unmounted something, it (along with its zero count) won't
	// have been encoded into the version of mountpoints.json that
	// we're loading, so our count could fall out of sync with it
	// if we don't, and if we subsequently change something else,
	// we'd pass that error along to other process that reloaded
	// the data after we saved it.
	for _, layer := range r.layers {
		layer.MountPoint = ""
		layer.MountCount = 0
	}
	// All of the non-zero count values will have been encoded, so
	// we reset the still-mounted ones based on the contents.
	for _, mount := range layerMounts {
		if mount.MountPoint != "" {
			if layer, ok := r.lookup(mount.ID); ok {
				mounts[mount.MountPoint] = layer
				layer.MountPoint = mount.MountPoint
				layer.MountCount = mount.MountCount
			}
		}
	}
	r.bymount = mounts
	return nil
}

// save saves the contents of the store to disk.
// The caller must hold r.lockfile locked for writing.
// The caller must hold r.inProcessLock for WRITING.
func (r *layerStore) save(saveLocations layerLocations) error {
	r.mountsLockfile.Lock()
	defer r.mountsLockfile.Unlock()
	if err := r.saveLayers(saveLocations); err != nil {
		return err
	}
	return r.saveMounts()
}

// saveFor saves the contents of the store relevant for modifiedLayer to disk.
// The caller must hold r.lockfile locked for writing.
// The caller must hold r.inProcessLock for WRITING.
func (r *layerStore) saveFor(modifiedLayer *Layer) error {
	return r.save(layerLocation(modifiedLayer))
}

// The caller must hold r.lockfile locked for writing.
// The caller must hold r.inProcessLock for WRITING.
func (r *layerStore) saveLayers(saveLocations layerLocations) error {
	if !r.lockfile.IsReadWrite() {
		return fmt.Errorf("not allowed to modify the layer store at %q: %w", r.layerdir, ErrStoreIsReadOnly)
	}
	r.lockfile.AssertLockedForWriting()

	// This must be done before we write the file, because the process could be terminated
	// after the file is written but before the lock file is updated.
	lw, err := r.lockfile.RecordWrite()
	if err != nil {
		return err
	}
	r.lastWrite = lw

	for locationIndex := 0; locationIndex < numLayerLocationIndex; locationIndex++ {
		location := layerLocationFromIndex(locationIndex)
		if location&saveLocations == 0 {
			continue
		}
		rpath := r.jsonPath[locationIndex]
		if err := os.MkdirAll(filepath.Dir(rpath), 0o700); err != nil {
			return err
		}
		subsetLayers := make([]*Layer, 0, len(r.layers))
		for _, layer := range r.layers {
			if layerLocation(layer) == location {
				subsetLayers = append(subsetLayers, layer)
			}
		}

		jldata, err := json.Marshal(&subsetLayers)
		if err != nil {
			return err
		}
		opts := ioutils.AtomicFileWriterOptions{}
		if location == volatileLayerLocation {
			opts.NoSync = true
		}
		if err := ioutils.AtomicWriteFileWithOpts(rpath, jldata, 0o600, &opts); err != nil {
			return err
		}
		r.layerspathsModified[locationIndex] = opts.ModTime
	}
	return nil
}

// The caller must hold r.mountsLockfile for writing.
// The caller must hold r.inProcessLock for WRITING.
func (r *layerStore) saveMounts() error {
	if !r.lockfile.IsReadWrite() {
		return fmt.Errorf("not allowed to modify the layer store at %q: %w", r.layerdir, ErrStoreIsReadOnly)
	}
	r.mountsLockfile.AssertLockedForWriting()
	mpath := r.mountspath()
	if err := os.MkdirAll(filepath.Dir(mpath), 0o700); err != nil {
		return err
	}
	mounts := make([]layerMountPoint, 0, len(r.layers))
	for _, layer := range r.layers {
		if layer.MountPoint != "" && layer.MountCount > 0 {
			mounts = append(mounts, layerMountPoint{
				ID:         layer.ID,
				MountPoint: layer.MountPoint,
				MountCount: layer.MountCount,
			})
		}
	}
	jmdata, err := json.Marshal(&mounts)
	if err != nil {
		return err
	}

	// This must be done before we write the file, because the process could be terminated
	// after the file is written but before the lock file is updated.
	lw, err := r.mountsLockfile.RecordWrite()
	if err != nil {
		return err
	}
	r.mountsLastWrite = lw

	if err = ioutils.AtomicWriteFile(mpath, jmdata, 0o600); err != nil {
		return err
	}
	return r.loadMounts()
}

func (s *store) newLayerStore(rundir, layerdir, imagedir string, driver drivers.Driver, transient bool) (rwLayerStore, error) {
	if err := os.MkdirAll(rundir, 0o700); err != nil {
		return nil, err
	}
	if err := os.MkdirAll(layerdir, 0o700); err != nil {
		return nil, err
	}
	if imagedir != "" {
		if err := os.MkdirAll(imagedir, 0o700); err != nil {
			return nil, err
		}
	}
	// Note: While the containers.lock file is in rundir for transient stores
	// we don't want to do this here, because the non-transient layers in
	// layers.json might be used externally as a read-only layer (using e.g.
	// additionalimagestores), and that would look for the lockfile in the
	// same directory
	var lockFiles []*lockfile.LockFile
	lockFile, err := lockfile.GetLockFile(filepath.Join(layerdir, "layers.lock"))
	if err != nil {
		return nil, err
	}
	lockFiles = append(lockFiles, lockFile)
	if imagedir != "" {
		lockFile, err := lockfile.GetLockFile(filepath.Join(imagedir, "layers.lock"))
		if err != nil {
			return nil, err
		}
		lockFiles = append(lockFiles, lockFile)
	}

	mountsLockfile, err := lockfile.GetLockFile(filepath.Join(rundir, "mountpoints.lock"))
	if err != nil {
		return nil, err
	}
	volatileDir := layerdir
	if transient {
		volatileDir = rundir
	}
	rlstore := layerStore{
		lockfile:       newMultipleLockFile(lockFiles...),
		mountsLockfile: mountsLockfile,
		rundir:         rundir,
		jsonPath: [numLayerLocationIndex]string{
			filepath.Join(layerdir, "layers.json"),
			filepath.Join(volatileDir, "volatile-layers.json"),
		},
		layerdir: layerdir,

		byid:    make(map[string]*Layer),
		byname:  make(map[string]*Layer),
		bymount: make(map[string]*Layer),

		driver: driver,
	}
	if err := rlstore.startWritingWithReload(false); err != nil {
		return nil, err
	}
	defer rlstore.stopWriting()
	lw, err := rlstore.lockfile.GetLastWrite()
	if err != nil {
		return nil, err
	}
	rlstore.lastWrite = lw
	// rlstore.mountsLastWrite is initialized inside rlstore.load().
	if _, err := rlstore.load(true); err != nil {
		return nil, err
	}
	return &rlstore, nil
}

func newROLayerStore(rundir string, layerdir string, driver drivers.Driver) (roLayerStore, error) {
	lockfile, err := lockfile.GetROLockFile(filepath.Join(layerdir, "layers.lock"))
	if err != nil {
		return nil, err
	}
	rlstore := layerStore{
		lockfile:       newMultipleLockFile(lockfile),
		mountsLockfile: nil,
		rundir:         rundir,
		jsonPath: [numLayerLocationIndex]string{
			filepath.Join(layerdir, "layers.json"),
			filepath.Join(layerdir, "volatile-layers.json"),
		},
		layerdir: layerdir,

		byid:    make(map[string]*Layer),
		byname:  make(map[string]*Layer),
		bymount: make(map[string]*Layer),

		driver: driver,
	}
	if err := rlstore.startReadingWithReload(false); err != nil {
		return nil, err
	}
	defer rlstore.stopReading()
	lw, err := rlstore.lockfile.GetLastWrite()
	if err != nil {
		return nil, err
	}
	rlstore.lastWrite = lw
	if _, err := rlstore.load(false); err != nil {
		return nil, err
	}
	return &rlstore, nil
}

// Requires startReading or startWriting.
func (r *layerStore) lookup(id string) (*Layer, bool) {
	if layer, ok := r.byid[id]; ok {
		return layer, ok
	} else if layer, ok := r.byname[id]; ok {
		return layer, ok
	} else if longid, err := r.idindex.Get(id); err == nil {
		layer, ok := r.byid[longid]
		return layer, ok
	}
	return nil, false
}

// Requires startReading or startWriting.
func (r *layerStore) Size(name string) (int64, error) {
	layer, ok := r.lookup(name)
	if !ok {
		return -1, ErrLayerUnknown
	}
	// We use the presence of a non-empty digest as an indicator that the size value was intentionally set, and that
	// a zero value is not just present because it was never set to anything else (which can happen if the layer was
	// created by a version of this library that didn't keep track of digest and size information).
	if layer.UncompressedDigest != "" || layer.TOCDigest != "" {
		return layer.UncompressedSize, nil // This may return -1 if only TOCDigest is set
	}
	return -1, nil
}

// Requires startWriting.
func (r *layerStore) ClearFlag(id string, flag string) error {
	if !r.lockfile.IsReadWrite() {
		return fmt.Errorf("not allowed to clear flags on layers at %q: %w", r.layerdir, ErrStoreIsReadOnly)
	}
	layer, ok := r.lookup(id)
	if !ok {
		return ErrLayerUnknown
	}
	delete(layer.Flags, flag)
	return r.saveFor(layer)
}

// Requires startWriting.
func (r *layerStore) SetFlag(id string, flag string, value interface{}) error {
	if !r.lockfile.IsReadWrite() {
		return fmt.Errorf("not allowed to set flags on layers at %q: %w", r.layerdir, ErrStoreIsReadOnly)
	}
	layer, ok := r.lookup(id)
	if !ok {
		return ErrLayerUnknown
	}
	if layer.Flags == nil {
		layer.Flags = make(map[string]interface{})
	}
	layer.Flags[flag] = value
	return r.saveFor(layer)
}

func (r *layerStore) Status() ([][2]string, error) {
	return r.driver.Status(), nil
}

// Requires startWriting.
func (r *layerStore) PutAdditionalLayer(id string, parentLayer *Layer, names []string, aLayer drivers.AdditionalLayer) (layer *Layer, err error) {
	if duplicateLayer, idInUse := r.byid[id]; idInUse {
		return duplicateLayer, ErrDuplicateID
	}
	for _, name := range names {
		if _, nameInUse := r.byname[name]; nameInUse {
			return nil, ErrDuplicateName
		}
	}

	parent := ""
	if parentLayer != nil {
		parent = parentLayer.ID
	}

	info, err := aLayer.Info()
	if err != nil {
		return nil, err
	}
	defer info.Close()
	layer = &Layer{}
	if err := json.NewDecoder(info).Decode(layer); err != nil {
		return nil, err
	}
	layer.ID = id
	layer.Parent = parent
	layer.Created = time.Now().UTC()

	if err := aLayer.CreateAs(id, parent); err != nil {
		return nil, err
	}

	// TODO: check if necessary fields are filled
	r.layers = append(r.layers, layer)
	// This can only fail on duplicate IDs, which shouldn’t happen — and in
	// that case the index is already in the desired state anyway.
	// Implementing recovery from an unlikely and unimportant failure here
	// would be too risky.
	_ = r.idindex.Add(id)
	r.byid[id] = layer
	for _, name := range names { // names got from the additional layer store won't be used
		r.byname[name] = layer
	}
	if layer.CompressedDigest != "" {
		r.bycompressedsum[layer.CompressedDigest] = append(r.bycompressedsum[layer.CompressedDigest], layer.ID)
	}
	if layer.UncompressedDigest != "" {
		r.byuncompressedsum[layer.UncompressedDigest] = append(r.byuncompressedsum[layer.UncompressedDigest], layer.ID)
	}
	if layer.TOCDigest != "" {
		r.bytocsum[layer.TOCDigest] = append(r.bytocsum[layer.TOCDigest], layer.ID)
	}
	if err := r.saveFor(layer); err != nil {
		if e := r.Delete(layer.ID); e != nil {
			logrus.Errorf("While recovering from a failure to save layers, error deleting layer %#v: %v", id, e)
		}
		return nil, err
	}
	return copyLayer(layer), nil
}

// Requires startWriting.
func (r *layerStore) create(id string, parentLayer *Layer, names []string, mountLabel string, options map[string]string, moreOptions *LayerOptions, writeable bool, diff io.Reader, slo *stagedLayerOptions) (layer *Layer, size int64, err error) {
	if moreOptions == nil {
		moreOptions = &LayerOptions{}
	}
	if !r.lockfile.IsReadWrite() {
		return nil, -1, fmt.Errorf("not allowed to create new layers at %q: %w", r.layerdir, ErrStoreIsReadOnly)
	}
	if err := os.MkdirAll(r.rundir, 0o700); err != nil {
		return nil, -1, err
	}
	if err := os.MkdirAll(r.layerdir, 0o700); err != nil {
		return nil, -1, err
	}
	if id == "" {
		id = stringid.GenerateRandomID()
		_, idInUse := r.byid[id]
		for idInUse {
			id = stringid.GenerateRandomID()
			_, idInUse = r.byid[id]
		}
	}
	if duplicateLayer, idInUse := r.byid[id]; idInUse {
		return duplicateLayer, -1, ErrDuplicateID
	}
	names = dedupeStrings(names)
	for _, name := range names {
		if _, nameInUse := r.byname[name]; nameInUse {
			return nil, -1, ErrDuplicateName
		}
	}
	parent := ""
	if parentLayer != nil {
		parent = parentLayer.ID
	}
	var (
		templateIDMappings         *idtools.IDMappings
		templateMetadata           string
		templateCompressedDigest   digest.Digest
		templateCompressedSize     int64
		templateUncompressedDigest digest.Digest
		templateTOCDigest          digest.Digest
		templateUncompressedSize   int64
		templateCompressionType    archive.Compression
		templateUIDs, templateGIDs []uint32
		templateTSdata             []byte
	)
	if moreOptions.TemplateLayer != "" {
		templateLayer, ok := r.lookup(moreOptions.TemplateLayer)
		if !ok {
			return nil, -1, ErrLayerUnknown
		}
		templateMetadata = templateLayer.Metadata
		templateIDMappings = idtools.NewIDMappingsFromMaps(templateLayer.UIDMap, templateLayer.GIDMap)
		templateTOCDigest = templateLayer.TOCDigest
		templateCompressedDigest, templateCompressedSize = templateLayer.CompressedDigest, templateLayer.CompressedSize
		templateUncompressedDigest, templateUncompressedSize = templateLayer.UncompressedDigest, templateLayer.UncompressedSize
		templateCompressionType = templateLayer.CompressionType
		templateUIDs, templateGIDs = slices.Clone(templateLayer.UIDs), slices.Clone(templateLayer.GIDs)
		templateTSdata, err = os.ReadFile(r.tspath(templateLayer.ID))
		if err != nil && !errors.Is(err, os.ErrNotExist) {
			return nil, -1, err
		}
	} else {
		templateIDMappings = &idtools.IDMappings{}
	}
	if mountLabel != "" {
		selinux.ReserveLabel(mountLabel)
	}

	// Before actually creating the layer, make a persistent record of it
	// with the incomplete flag set, so that future processes have a chance
	// to clean up after it.
	layer = &Layer{
		ID:                 id,
		Parent:             parent,
		Names:              names,
		MountLabel:         mountLabel,
		Metadata:           templateMetadata,
		Created:            time.Now().UTC(),
		CompressedDigest:   templateCompressedDigest,
		CompressedSize:     templateCompressedSize,
		UncompressedDigest: templateUncompressedDigest,
		TOCDigest:          templateTOCDigest,
		UncompressedSize:   templateUncompressedSize,
		CompressionType:    templateCompressionType,
		UIDs:               templateUIDs,
		GIDs:               templateGIDs,
		Flags:              newMapFrom(moreOptions.Flags),
		UIDMap:             copySlicePreferringNil(moreOptions.UIDMap),
		GIDMap:             copySlicePreferringNil(moreOptions.GIDMap),
		BigDataNames:       []string{},
		volatileStore:      moreOptions.Volatile,
	}
	layer.Flags[incompleteFlag] = true

	r.layers = append(r.layers, layer)
	// This can only fail if the ID is already missing, which shouldn’t
	// happen — and in that case the index is already in the desired state
	// anyway.  This is on various paths to recover from failures, so this
	// should be robust against partially missing data.
	_ = r.idindex.Add(id)
	r.byid[id] = layer
	for _, name := range names {
		r.byname[name] = layer
	}

	cleanupFailureContext := ""
	defer func() {
		if err != nil {
			// now that the in-memory structures know about the new
			// record, we can use regular Delete() to clean up if
			// anything breaks from here on out
			if cleanupFailureContext == "" {
				cleanupFailureContext = "unknown: cleanupFailureContext not set at the failure site"
			}
			if e := r.Delete(id); e != nil {
				logrus.Errorf("While recovering from a failure (%s), error deleting layer %#v: %v", cleanupFailureContext, id, e)
			}
		}
	}()

	if err = r.saveFor(layer); err != nil {
		cleanupFailureContext = "saving incomplete layer metadata"
		return nil, -1, err
	}

	for _, item := range moreOptions.BigData {
		if err = r.setBigData(layer, item.Key, item.Data); err != nil {
			cleanupFailureContext = fmt.Sprintf("saving big data item %q", item.Key)
			return nil, -1, err
		}
	}

	idMappings := idtools.NewIDMappingsFromMaps(moreOptions.UIDMap, moreOptions.GIDMap)
	opts := drivers.CreateOpts{
		MountLabel: mountLabel,
		StorageOpt: options,
		IDMappings: idMappings,
	}

	var parentMappings, oldMappings *idtools.IDMappings
	if parentLayer != nil {
		parentMappings = idtools.NewIDMappingsFromMaps(parentLayer.UIDMap, parentLayer.GIDMap)
	} else {
		parentMappings = &idtools.IDMappings{}
	}
	if moreOptions.TemplateLayer != "" {
		if err = r.driver.CreateFromTemplate(id, moreOptions.TemplateLayer, templateIDMappings, parent, parentMappings, &opts, writeable); err != nil {
			cleanupFailureContext = fmt.Sprintf("creating a layer from template layer %q", moreOptions.TemplateLayer)
			return nil, -1, fmt.Errorf("creating copy of template layer %q with ID %q: %w", moreOptions.TemplateLayer, id, err)
		}
		oldMappings = templateIDMappings
	} else {
		if writeable {
			if err = r.driver.CreateReadWrite(id, parent, &opts); err != nil {
				cleanupFailureContext = "creating a read-write layer"
				return nil, -1, fmt.Errorf("creating read-write layer with ID %q: %w", id, err)
			}
		} else {
			if err = r.driver.Create(id, parent, &opts); err != nil {
				cleanupFailureContext = "creating a read-only layer"
				return nil, -1, fmt.Errorf("creating read-only layer with ID %q: %w", id, err)
			}
		}
		if parentLayer != nil {
			oldMappings = parentMappings
		}
	}

	if oldMappings != nil &&
		(!reflect.DeepEqual(oldMappings.UIDs(), idMappings.UIDs()) || !reflect.DeepEqual(oldMappings.GIDs(), idMappings.GIDs())) {
		if err = r.driver.UpdateLayerIDMap(id, oldMappings, idMappings, mountLabel); err != nil {
			cleanupFailureContext = "in UpdateLayerIDMap"
			return nil, -1, err
		}
	}

	if len(templateTSdata) > 0 {
		if err = os.MkdirAll(filepath.Dir(r.tspath(id)), 0o700); err != nil {
			cleanupFailureContext = "creating tar-split parent directory for a copy from template"
			return nil, -1, err
		}
		if err = ioutils.AtomicWriteFile(r.tspath(id), templateTSdata, 0o600); err != nil {
			cleanupFailureContext = "creating a tar-split copy from template"
			return nil, -1, err
		}
	}

	size = -1
	if diff != nil {
		if size, err = r.applyDiffWithOptions(layer.ID, moreOptions, diff); err != nil {
			cleanupFailureContext = "applying layer diff"
			return nil, -1, err
		}
	} else if slo != nil {
		if err := r.applyDiffFromStagingDirectory(layer.ID, slo.DiffOutput, slo.DiffOptions); err != nil {
			cleanupFailureContext = "applying staged directory diff"
			return nil, -1, err
		}
	} else {
		// applyDiffWithOptions() would have updated r.bycompressedsum
		// and r.byuncompressedsum for us, but if we used a template
		// layer, we didn't call it, so add the new layer as candidates
		// for searches for layers by checksum
		if layer.CompressedDigest != "" {
			r.bycompressedsum[layer.CompressedDigest] = append(r.bycompressedsum[layer.CompressedDigest], layer.ID)
		}
		if layer.UncompressedDigest != "" {
			r.byuncompressedsum[layer.UncompressedDigest] = append(r.byuncompressedsum[layer.UncompressedDigest], layer.ID)
		}
		if layer.TOCDigest != "" {
			r.bytocsum[layer.TOCDigest] = append(r.bytocsum[layer.TOCDigest], layer.ID)
		}
	}

	delete(layer.Flags, incompleteFlag)
	if err = r.saveFor(layer); err != nil {
		cleanupFailureContext = "saving finished layer metadata"
		return nil, -1, err
	}

	layer = copyLayer(layer)
	return layer, size, err
}

// Requires startReading or startWriting.
func (r *layerStore) Mounted(id string) (int, error) {
	if !r.lockfile.IsReadWrite() {
		return 0, fmt.Errorf("no mount information for layers at %q: %w", r.mountspath(), ErrStoreIsReadOnly)
	}
	layer, ok := r.lookup(id)
	if !ok {
		return 0, ErrLayerUnknown
	}
	// NOTE: The caller of this function is not holding (currently cannot hold) r.mountsLockfile,
	// so the data is necessarily obsolete by the time this function returns. So, we don’t even
	// try to reload it in this function, we just rely on r.load() that happened during
	// r.startReading() or r.startWriting().
	return layer.MountCount, nil
}

// Requires startWriting.
func (r *layerStore) Mount(id string, options drivers.MountOpts) (string, error) {
	// LOCKING BUG: This is reachable via store.Diff → layerStore.Diff → layerStore.newFileGetter
	// (with btrfs and zfs graph drivers) holding layerStore only locked for reading, while it modifies
	// - r.layers[].MountCount (directly and via loadMounts / saveMounts)
	// - r.layers[].MountPoint (directly and via loadMounts / saveMounts)
	// - r.bymount (via loadMounts / saveMounts)

	// You are not allowed to mount layers from readonly stores if they
	// are not mounted read/only.
	if !r.lockfile.IsReadWrite() && !slices.Contains(options.Options, "ro") {
		return "", fmt.Errorf("not allowed to update mount locations for layers at %q: %w", r.mountspath(), ErrStoreIsReadOnly)
	}
	r.mountsLockfile.Lock()
	defer r.mountsLockfile.Unlock()
	if err := r.reloadMountsIfChanged(); err != nil {
		return "", err
	}
	layer, ok := r.lookup(id)
	if !ok {
		return "", ErrLayerUnknown
	}
	if layer.MountCount > 0 {
		mounted, err := mount.Mounted(layer.MountPoint)
		if err != nil {
			return "", err
		}
		// If the container is not mounted then we have a condition
		// where the kernel umounted the mount point. This means
		// that the mount count never got decremented.
		if mounted {
			layer.MountCount++
			return layer.MountPoint, r.saveMounts()
		}
	}
	if options.MountLabel == "" {
		options.MountLabel = layer.MountLabel
	}

	if (options.UidMaps != nil || options.GidMaps != nil) && !r.driver.SupportsShifting() {
		if !reflect.DeepEqual(options.UidMaps, layer.UIDMap) || !reflect.DeepEqual(options.GidMaps, layer.GIDMap) {
			return "", fmt.Errorf("cannot mount layer %v: shifting not enabled", layer.ID)
		}
	}
	mountpoint, err := r.driver.Get(id, options)
	if mountpoint != "" && err == nil {
		if layer.MountPoint != "" {
			delete(r.bymount, layer.MountPoint)
		}
		layer.MountPoint = filepath.Clean(mountpoint)
		layer.MountCount++
		r.bymount[layer.MountPoint] = layer
		err = r.saveMounts()
	}
	return mountpoint, err
}

// Requires startWriting.
func (r *layerStore) unmount(id string, force bool, conditional bool) (bool, error) {
	// LOCKING BUG: This is reachable via store.Diff → layerStore.Diff → layerStore.newFileGetter → simpleGetCloser.Close()
	// (with btrfs and zfs graph drivers) holding layerStore only locked for reading, while it modifies
	// - r.layers[].MountCount (directly and via loadMounts / saveMounts)
	// - r.layers[].MountPoint (directly and via loadMounts / saveMounts)
	// - r.bymount (via loadMounts / saveMounts)

	if !r.lockfile.IsReadWrite() {
		return false, fmt.Errorf("not allowed to update mount locations for layers at %q: %w", r.mountspath(), ErrStoreIsReadOnly)
	}
	r.mountsLockfile.Lock()
	defer r.mountsLockfile.Unlock()
	if err := r.reloadMountsIfChanged(); err != nil {
		return false, err
	}
	layer, ok := r.lookup(id)
	if !ok {
		layerByMount, ok := r.bymount[filepath.Clean(id)]
		if !ok {
			return false, ErrLayerUnknown
		}
		layer = layerByMount
	}
	if conditional && layer.MountCount == 0 {
		return false, ErrLayerNotMounted
	}
	if force {
		layer.MountCount = 1
	}
	if layer.MountCount > 1 {
		layer.MountCount--
		return true, r.saveMounts()
	}
	err := r.driver.Put(id)
	if err == nil || os.IsNotExist(err) {
		if layer.MountPoint != "" {
			delete(r.bymount, layer.MountPoint)
		}
		layer.MountCount--
		layer.MountPoint = ""
		return false, r.saveMounts()
	}
	return true, err
}

// Requires startReading or startWriting.
func (r *layerStore) ParentOwners(id string) (uids, gids []int, err error) {
	if !r.lockfile.IsReadWrite() {
		return nil, nil, fmt.Errorf("no mount information for layers at %q: %w", r.mountspath(), ErrStoreIsReadOnly)
	}
	r.mountsLockfile.RLock()
	defer r.mountsLockfile.Unlock()
	// We are not checking r.mountsLockfile.Modified() and calling r.loadMounts here because the store
	// is only locked for reading = we are not allowed to modify layer data.
	// Holding r.mountsLockfile protects us against concurrent mount/unmount operations.
	layer, ok := r.lookup(id)
	if !ok {
		return nil, nil, ErrLayerUnknown
	}
	if len(layer.UIDMap) == 0 && len(layer.GIDMap) == 0 {
		// We're not using any mappings, so there aren't any unmapped IDs on parent directories.
		return nil, nil, nil
	}
	if layer.MountPoint == "" {
		// We don't know which directories to examine.
		return nil, nil, ErrLayerNotMounted
	}
	// Holding r.mountsLockfile protects us against concurrent mount/unmount operations, but we didn’t
	// hold it continuously since the time we loaded the mount data; so it’s possible the layer
	// was unmounted in the meantime, or mounted elsewhere. Treat that as if we were run after the unmount,
	// = a missing mount, not a filesystem error.
	if _, err := system.Lstat(layer.MountPoint); errors.Is(err, os.ErrNotExist) {
		return nil, nil, ErrLayerNotMounted
	}
	rootuid, rootgid, err := idtools.GetRootUIDGID(layer.UIDMap, layer.GIDMap)
	if err != nil {
		return nil, nil, fmt.Errorf("reading root ID values for layer %q: %w", layer.ID, err)
	}
	m := idtools.NewIDMappingsFromMaps(layer.UIDMap, layer.GIDMap)
	fsuids := make(map[int]struct{})
	fsgids := make(map[int]struct{})
	for dir := filepath.Dir(layer.MountPoint); dir != "" && dir != string(os.PathSeparator); dir = filepath.Dir(dir) {
		st, err := system.Stat(dir)
		if err != nil {
			return nil, nil, fmt.Errorf("read directory ownership: %w", err)
		}
		lst, err := system.Lstat(dir)
		if err != nil {
			return nil, nil, err
		}
		fsuid := int(st.UID())
		fsgid := int(st.GID())
		if _, _, err := m.ToContainer(idtools.IDPair{UID: fsuid, GID: rootgid}); err != nil {
			fsuids[fsuid] = struct{}{}
		}
		if _, _, err := m.ToContainer(idtools.IDPair{UID: rootuid, GID: fsgid}); err != nil {
			fsgids[fsgid] = struct{}{}
		}
		fsuid = int(lst.UID())
		fsgid = int(lst.GID())
		if _, _, err := m.ToContainer(idtools.IDPair{UID: fsuid, GID: rootgid}); err != nil {
			fsuids[fsuid] = struct{}{}
		}
		if _, _, err := m.ToContainer(idtools.IDPair{UID: rootuid, GID: fsgid}); err != nil {
			fsgids[fsgid] = struct{}{}
		}
	}
	for uid := range fsuids {
		uids = append(uids, uid)
	}
	for gid := range fsgids {
		gids = append(gids, gid)
	}
	if len(uids) > 1 {
		sort.Ints(uids)
	}
	if len(gids) > 1 {
		sort.Ints(gids)
	}
	return uids, gids, nil
}

// Requires startWriting.
func (r *layerStore) removeName(layer *Layer, name string) {
	layer.Names = stringSliceWithoutValue(layer.Names, name)
}

// Requires startWriting.
func (r *layerStore) updateNames(id string, names []string, op updateNameOperation) error {
	if !r.lockfile.IsReadWrite() {
		return fmt.Errorf("not allowed to change layer name assignments at %q: %w", r.layerdir, ErrStoreIsReadOnly)
	}
	layer, ok := r.lookup(id)
	if !ok {
		return ErrLayerUnknown
	}
	oldNames := layer.Names
	names, err := applyNameOperation(oldNames, names, op)
	if err != nil {
		return err
	}
	for _, name := range oldNames {
		delete(r.byname, name)
	}
	for _, name := range names {
		if otherLayer, ok := r.byname[name]; ok {
			r.removeName(otherLayer, name)
		}
		r.byname[name] = layer
	}
	layer.Names = names
	return r.saveFor(layer)
}

func (r *layerStore) datadir(id string) string {
	return filepath.Join(r.layerdir, id)
}

func (r *layerStore) datapath(id, key string) string {
	return filepath.Join(r.datadir(id), makeBigDataBaseName(key))
}

// Requires startReading or startWriting.
func (r *layerStore) BigData(id, key string) (io.ReadCloser, error) {
	if key == "" {
		return nil, fmt.Errorf("can't retrieve layer big data value for empty name: %w", ErrInvalidBigDataName)
	}
	layer, ok := r.lookup(id)
	if !ok {
		return nil, fmt.Errorf("locating layer with ID %q: %w", id, ErrLayerUnknown)
	}
	return os.Open(r.datapath(layer.ID, key))
}

// Requires startWriting.
func (r *layerStore) SetBigData(id, key string, data io.Reader) error {
	if !r.lockfile.IsReadWrite() {
		return fmt.Errorf("not allowed to save data items associated with layers at %q: %w", r.layerdir, ErrStoreIsReadOnly)
	}
	layer, ok := r.lookup(id)
	if !ok {
		return fmt.Errorf("locating layer with ID %q to write bigdata: %w", id, ErrLayerUnknown)
	}
	return r.setBigData(layer, key, data)
}

func (r *layerStore) setBigData(layer *Layer, key string, data io.Reader) error {
	if key == "" {
		return fmt.Errorf("can't set empty name for layer big data item: %w", ErrInvalidBigDataName)
	}
	err := os.MkdirAll(r.datadir(layer.ID), 0o700)
	if err != nil {
		return err
	}

	// NewAtomicFileWriter doesn't overwrite/truncate the existing inode.
	// BigData() relies on this behaviour when opening the file for read
	// so that it is either accessing the old data or the new one.
	writer, err := ioutils.NewAtomicFileWriter(r.datapath(layer.ID, key), 0o600)
	if err != nil {
		return fmt.Errorf("opening bigdata file: %w", err)
	}

	if _, err := io.Copy(writer, data); err != nil {
		writer.Close()
		return fmt.Errorf("copying bigdata for the layer: %w", err)

	}
	if err := writer.Close(); err != nil {
		return fmt.Errorf("closing bigdata file for the layer: %w", err)
	}

	if !slices.Contains(layer.BigDataNames, key) {
		layer.BigDataNames = append(layer.BigDataNames, key)
		return r.saveFor(layer)
	}
	return nil
}

// Requires startReading or startWriting.
func (r *layerStore) BigDataNames(id string) ([]string, error) {
	layer, ok := r.lookup(id)
	if !ok {
		return nil, fmt.Errorf("locating layer with ID %q to retrieve bigdata names: %w", id, ErrImageUnknown)
	}
	return copySlicePreferringNil(layer.BigDataNames), nil
}

// Requires startReading or startWriting.
func (r *layerStore) Metadata(id string) (string, error) {
	if layer, ok := r.lookup(id); ok {
		return layer.Metadata, nil
	}
	return "", ErrLayerUnknown
}

// Requires startWriting.
func (r *layerStore) SetMetadata(id, metadata string) error {
	if !r.lockfile.IsReadWrite() {
		return fmt.Errorf("not allowed to modify layer metadata at %q: %w", r.layerdir, ErrStoreIsReadOnly)
	}
	if layer, ok := r.lookup(id); ok {
		layer.Metadata = metadata
		return r.saveFor(layer)
	}
	return ErrLayerUnknown
}

func (r *layerStore) tspath(id string) string {
	return filepath.Join(r.layerdir, id+tarSplitSuffix)
}

// layerHasIncompleteFlag returns true if layer.Flags contains an incompleteFlag set to true
// The caller must hold r.inProcessLock for reading.
func layerHasIncompleteFlag(layer *Layer) bool {
	if layer.Flags == nil {
		return false
	}
	if flagValue, ok := layer.Flags[incompleteFlag]; ok {
		if b, ok := flagValue.(bool); ok && b {
			return true
		}
	}
	return false
}

// Requires startWriting.
func (r *layerStore) deleteInternal(id string) error {
	if !r.lockfile.IsReadWrite() {
		return fmt.Errorf("not allowed to delete layers at %q: %w", r.layerdir, ErrStoreIsReadOnly)
	}
	layer, ok := r.lookup(id)
	if !ok {
		return ErrLayerUnknown
	}
	// Ensure that if we are interrupted, the layer will be cleaned up.
	if !layerHasIncompleteFlag(layer) {
		if layer.Flags == nil {
			layer.Flags = make(map[string]interface{})
		}
		layer.Flags[incompleteFlag] = true
		if err := r.saveFor(layer); err != nil {
			return err
		}
	}
	// We never unset incompleteFlag; below, we remove the entire object from r.layers.
	id = layer.ID
	if err := r.driver.Remove(id); err != nil && !errors.Is(err, os.ErrNotExist) {
		return err
	}
	os.Remove(r.tspath(id))
	os.RemoveAll(r.datadir(id))
	delete(r.byid, id)
	for _, name := range layer.Names {
		delete(r.byname, name)
	}
	// This can only fail if the ID is already missing, which shouldn’t
	// happen — and in that case the index is already in the desired state
	// anyway.  The store’s Delete method is used on various paths to
	// recover from failures, so this should be robust against partially
	// missing data.
	_ = r.idindex.Delete(id)
	mountLabel := layer.MountLabel
	if layer.MountPoint != "" {
		delete(r.bymount, layer.MountPoint)
	}
	r.deleteInDigestMap(id)
	r.layers = slices.DeleteFunc(r.layers, func(candidate *Layer) bool {
		return candidate.ID == id
	})
	if mountLabel != "" && !slices.ContainsFunc(r.layers, func(candidate *Layer) bool {
		return candidate.MountLabel == mountLabel
	}) {
		selinux.ReleaseLabel(mountLabel)
	}
	return nil
}

// Requires startWriting.
func (r *layerStore) deleteInDigestMap(id string) {
	for digest, layers := range r.bycompressedsum {
		if i := slices.Index(layers, id); i != -1 {
			layers = slices.Delete(layers, i, i+1)
			r.bycompressedsum[digest] = layers
		}
	}
	for digest, layers := range r.byuncompressedsum {
		if i := slices.Index(layers, id); i != -1 {
			layers = slices.Delete(layers, i, i+1)
			r.byuncompressedsum[digest] = layers
		}
	}
}

// Requires startWriting.
func (r *layerStore) Delete(id string) error {
	layer, ok := r.lookup(id)
	if !ok {
		return ErrLayerUnknown
	}
	id = layer.ID
	// The layer may already have been explicitly unmounted, but if not, we
	// should try to clean that up before we start deleting anything at the
	// driver level.
	for {
		_, err := r.unmount(id, false, true)
		if err == ErrLayerNotMounted {
			break
		}
		if err != nil {
			return err
		}
	}
	if err := r.deleteInternal(id); err != nil {
		return err
	}
	return r.saveFor(layer)
}

// Requires startReading or startWriting.
func (r *layerStore) Exists(id string) bool {
	_, ok := r.lookup(id)
	return ok
}

// Requires startReading or startWriting.
func (r *layerStore) Get(id string) (*Layer, error) {
	if layer, ok := r.lookup(id); ok {
		return copyLayer(layer), nil
	}
	return nil, ErrLayerUnknown
}

// Requires startWriting.
func (r *layerStore) Wipe() error {
	if !r.lockfile.IsReadWrite() {
		return fmt.Errorf("not allowed to delete layers at %q: %w", r.layerdir, ErrStoreIsReadOnly)
	}
	ids := make([]string, 0, len(r.byid))
	for id := range r.byid {
		ids = append(ids, id)
	}
	sort.Slice(ids, func(i, j int) bool {
		return r.byid[ids[i]].Created.After(r.byid[ids[j]].Created)
	})
	for _, id := range ids {
		if err := r.Delete(id); err != nil {
			return err
		}
	}
	ids, err := r.driver.ListLayers()
	if err != nil {
		if !errors.Is(err, drivers.ErrNotSupported) {
			return err
		}
		ids = nil
	}
	for _, id := range ids {
		if err := r.driver.Remove(id); err != nil {
			return err
		}
	}
	return nil
}

// Requires startReading or startWriting.
func (r *layerStore) findParentAndLayer(from, to string) (fromID string, toID string, fromLayer, toLayer *Layer, err error) {
	var ok bool
	toLayer, ok = r.lookup(to)
	if !ok {
		return "", "", nil, nil, ErrLayerUnknown
	}
	to = toLayer.ID
	if from == "" {
		from = toLayer.Parent
	}
	if from != "" {
		fromLayer, ok = r.lookup(from)
		if ok {
			from = fromLayer.ID
		} else {
			fromLayer, ok = r.lookup(toLayer.Parent)
			if ok {
				from = fromLayer.ID
			}
		}
	}
	return from, to, fromLayer, toLayer, nil
}

// The caller must hold r.inProcessLock for reading.
func (r *layerStore) layerMappings(layer *Layer) *idtools.IDMappings {
	if layer == nil {
		return &idtools.IDMappings{}
	}
	return idtools.NewIDMappingsFromMaps(layer.UIDMap, layer.GIDMap)
}

// Requires startReading or startWriting.
//
// NOTE: Overlay’s implementation assumes use of an exclusive lock over the primary layer store,
// see drivers/overlay.Driver.getMergedDir.
func (r *layerStore) Changes(from, to string) ([]archive.Change, error) {
	from, to, fromLayer, toLayer, err := r.findParentAndLayer(from, to)
	if err != nil {
		return nil, ErrLayerUnknown
	}
	return r.driver.Changes(to, r.layerMappings(toLayer), from, r.layerMappings(fromLayer), toLayer.MountLabel)
}

type simpleGetCloser struct {
	r    *layerStore
	path string
	id   string
}

func (s *simpleGetCloser) Get(path string) (io.ReadCloser, error) {
	return os.Open(filepath.Join(s.path, path))
}

// LOCKING BUG: See the comments in layerStore.Diff
func (s *simpleGetCloser) Close() error {
	_, err := s.r.unmount(s.id, false, false)
	return err
}

// LOCKING BUG: See the comments in layerStore.Diff
func (r *layerStore) newFileGetter(id string) (drivers.FileGetCloser, error) {
	if getter, ok := r.driver.(drivers.DiffGetterDriver); ok {
		fgc, err := getter.DiffGetter(id)
		if err != nil {
			return nil, err
		}
		if fgc != nil {
			return fgc, nil
		}
	}

	path, err := r.Mount(id, drivers.MountOpts{Options: []string{"ro"}})
	if err != nil {
		return nil, err
	}
	return &simpleGetCloser{
		r:    r,
		path: path,
		id:   id,
	}, nil
}

// writeCompressedData copies data from source to compressor, which is on top of pwriter.
func writeCompressedData(compressor io.WriteCloser, source io.ReadCloser) error {
	defer compressor.Close()
	defer source.Close()
	_, err := io.Copy(compressor, source)
	return err
}

// writeCompressedDataGoroutine copies data from source to compressor, which is on top of pwriter.
// All error must be reported by updating pwriter.
func writeCompressedDataGoroutine(pwriter *io.PipeWriter, compressor io.WriteCloser, source io.ReadCloser) {
	err := errors.New("internal error: unexpected panic in writeCompressedDataGoroutine")
	defer func() { // Note that this is not the same as {defer dest.CloseWithError(err)}; we need err to be evaluated lazily.
		_ = pwriter.CloseWithError(err) // CloseWithError(nil) is equivalent to Close(), always returns nil
	}()
	err = writeCompressedData(compressor, source)
}

// Requires startReading or startWriting.
//
// NOTE: Overlay’s implementation assumes use of an exclusive lock over the primary layer store,
// see drivers/overlay.Driver.getMergedDir.
func (r *layerStore) Diff(from, to string, options *DiffOptions) (io.ReadCloser, error) {
	var metadata storage.Unpacker

	from, to, fromLayer, toLayer, err := r.findParentAndLayer(from, to)
	if err != nil {
		return nil, ErrLayerUnknown
	}
	// Default to applying the type of compression that we noted was used
	// for the layerdiff when it was applied.
	compression := toLayer.CompressionType
	// If a particular compression type (or no compression) was selected,
	// use that instead.
	if options != nil && options.Compression != nil {
		compression = *options.Compression
	}
	maybeCompressReadCloser := func(rc io.ReadCloser) (io.ReadCloser, error) {
		// Depending on whether or not compression is desired, return either the
		// passed-in ReadCloser, or a new one that provides its readers with a
		// compressed version of the data that the original would have provided
		// to its readers.
		if compression == archive.Uncompressed {
			return rc, nil
		}
		preader, pwriter := io.Pipe()
		compressor, err := archive.CompressStream(pwriter, compression)
		if err != nil {
			rc.Close()
			pwriter.Close()
			preader.Close()
			return nil, err
		}
		go writeCompressedDataGoroutine(pwriter, compressor, rc)
		return preader, nil
	}

	if from != toLayer.Parent {
		diff, err := r.driver.Diff(to, r.layerMappings(toLayer), from, r.layerMappings(fromLayer), toLayer.MountLabel)
		if err != nil {
			return nil, err
		}
		return maybeCompressReadCloser(diff)
	}

	if ad, ok := r.driver.(drivers.AdditionalLayerStoreDriver); ok {
		if aLayer, err := ad.LookupAdditionalLayerByID(to); err == nil {
			// This is an additional layer. We leverage blob API for acquiring the reproduced raw blob.
			info, err := aLayer.Info()
			if err != nil {
				aLayer.Release()
				return nil, err
			}
			defer info.Close()
			layer := &Layer{}
			if err := json.NewDecoder(info).Decode(layer); err != nil {
				aLayer.Release()
				return nil, err
			}
			blob, err := aLayer.Blob()
			if err != nil {
				aLayer.Release()
				return nil, err
			}
			// If layer compression type is different from the expected one, decompress and convert it.
			if compression != layer.CompressionType {
				diff, err := archive.DecompressStream(blob)
				if err != nil {
					if err2 := blob.Close(); err2 != nil {
						err = fmt.Errorf("failed to close blob file: %v: %w", err2, err)
					}
					aLayer.Release()
					return nil, err
				}
				rc, err := maybeCompressReadCloser(diff)
				if err != nil {
					if err2 := closeAll(blob.Close, diff.Close); err2 != nil {
						err = fmt.Errorf("failed to cleanup: %v: %w", err2, err)
					}
					aLayer.Release()
					return nil, err
				}
				return ioutils.NewReadCloserWrapper(rc, func() error {
					defer aLayer.Release()
					return closeAll(blob.Close, rc.Close)
				}), nil
			}
			return ioutils.NewReadCloserWrapper(blob, func() error { defer aLayer.Release(); return blob.Close() }), nil
		}
	}

	tsfile, err := os.Open(r.tspath(to))
	if err != nil {
		if !os.IsNotExist(err) {
			return nil, err
		}
		diff, err := r.driver.Diff(to, r.layerMappings(toLayer), from, r.layerMappings(fromLayer), toLayer.MountLabel)
		if err != nil {
			return nil, err
		}
		return maybeCompressReadCloser(diff)
	}

	decompressor, err := pgzip.NewReader(tsfile)
	if err != nil {
		if e := tsfile.Close(); e != nil {
			logrus.Debug(e)
		}
		return nil, err
	}

	metadata = storage.NewJSONUnpacker(decompressor)

	// LOCKING BUG: With btrfs and zfs graph drivers), this uses r.Mount() and r.unmount() holding layerStore only locked for reading
	// but they modify in-memory state.
	fgetter, err := r.newFileGetter(to)
	if err != nil {
		errs := multierror.Append(nil, fmt.Errorf("creating file-getter: %w", err))
		if err := decompressor.Close(); err != nil {
			errs = multierror.Append(errs, fmt.Errorf("closing decompressor: %w", err))
		}
		if err := tsfile.Close(); err != nil {
			errs = multierror.Append(errs, fmt.Errorf("closing tarstream headers: %w", err))
		}
		return nil, errs.ErrorOrNil()
	}

	tarstream := asm.NewOutputTarStream(fgetter, metadata)
	rc := ioutils.NewReadCloserWrapper(tarstream, func() error {
		var errs *multierror.Error
		if err := decompressor.Close(); err != nil {
			errs = multierror.Append(errs, fmt.Errorf("closing decompressor: %w", err))
		}
		if err := tsfile.Close(); err != nil {
			errs = multierror.Append(errs, fmt.Errorf("closing tarstream headers: %w", err))
		}
		if err := tarstream.Close(); err != nil {
			errs = multierror.Append(errs, fmt.Errorf("closing reconstructed tarstream: %w", err))
		}
		if err := fgetter.Close(); err != nil {
			errs = multierror.Append(errs, fmt.Errorf("closing file-getter: %w", err))
		}
		if errs != nil {
			return errs.ErrorOrNil()
		}
		return nil
	})
	return maybeCompressReadCloser(rc)
}

// Requires startReading or startWriting.
func (r *layerStore) DiffSize(from, to string) (size int64, err error) {
	var fromLayer, toLayer *Layer
	from, to, fromLayer, toLayer, err = r.findParentAndLayer(from, to)
	if err != nil {
		return -1, ErrLayerUnknown
	}
	return r.driver.DiffSize(to, r.layerMappings(toLayer), from, r.layerMappings(fromLayer), toLayer.MountLabel)
}

func updateDigestMap(m *map[digest.Digest][]string, oldvalue, newvalue digest.Digest, id string) {
	var newList []string
	if oldvalue != "" {
		for _, value := range (*m)[oldvalue] {
			if value != id {
				newList = append(newList, value)
			}
		}
		if len(newList) > 0 {
			(*m)[oldvalue] = newList
		} else {
			delete(*m, oldvalue)
		}
	}
	if newvalue != "" {
		(*m)[newvalue] = append((*m)[newvalue], id)
	}
}

// Requires startWriting.
func (r *layerStore) ApplyDiff(to string, diff io.Reader) (size int64, err error) {
	return r.applyDiffWithOptions(to, nil, diff)
}

// Requires startWriting.
func (r *layerStore) applyDiffWithOptions(to string, layerOptions *LayerOptions, diff io.Reader) (size int64, err error) {
	if !r.lockfile.IsReadWrite() {
		return -1, fmt.Errorf("not allowed to modify layer contents at %q: %w", r.layerdir, ErrStoreIsReadOnly)
	}

	layer, ok := r.lookup(to)
	if !ok {
		return -1, ErrLayerUnknown
	}

	header := make([]byte, 10240)
	n, err := diff.Read(header)
	if err != nil && err != io.EOF {
		return -1, err
	}
	compression := archive.DetectCompression(header[:n])
	defragmented := io.MultiReader(bytes.NewReader(header[:n]), diff)

	// Decide if we need to compute digests
	var compressedDigest, uncompressedDigest digest.Digest       // = ""
	var compressedDigester, uncompressedDigester digest.Digester // = nil
	if layerOptions != nil && layerOptions.OriginalDigest != "" &&
		layerOptions.OriginalDigest.Algorithm() == digest.Canonical {
		compressedDigest = layerOptions.OriginalDigest
	} else {
		compressedDigester = digest.Canonical.Digester()
	}
	if layerOptions != nil && layerOptions.UncompressedDigest != "" &&
		layerOptions.UncompressedDigest.Algorithm() == digest.Canonical {
		uncompressedDigest = layerOptions.UncompressedDigest
	} else if compression != archive.Uncompressed {
		uncompressedDigester = digest.Canonical.Digester()
	}

	var compressedWriter io.Writer
	if compressedDigester != nil {
		compressedWriter = compressedDigester.Hash()
	} else {
		compressedWriter = io.Discard
	}
	compressedCounter := ioutils.NewWriteCounter(compressedWriter)
	defragmented = io.TeeReader(defragmented, compressedCounter)

	tsdata := bytes.Buffer{}
	uidLog := make(map[uint32]struct{})
	gidLog := make(map[uint32]struct{})
	var uncompressedCounter *ioutils.WriteCounter

	size, err = func() (int64, error) { // A scope for defer
		compressor, err := pgzip.NewWriterLevel(&tsdata, pgzip.BestSpeed)
		if err != nil {
			return -1, err
		}
		defer compressor.Close()                                        // This must happen before tsdata is consumed.
		if err := compressor.SetConcurrency(1024*1024, 1); err != nil { // 1024*1024 is the hard-coded default; we're not changing that
			logrus.Infof("setting compression concurrency threads to 1: %v; ignoring", err)
		}
		metadata := storage.NewJSONPacker(compressor)
		uncompressed, err := archive.DecompressStream(defragmented)
		if err != nil {
			return -1, err
		}
		defer uncompressed.Close()
		idLogger, err := tarlog.NewLogger(func(h *tar.Header) {
			if !strings.HasPrefix(path.Base(h.Name), archive.WhiteoutPrefix) {
				uidLog[uint32(h.Uid)] = struct{}{}
				gidLog[uint32(h.Gid)] = struct{}{}
			}
		})
		if err != nil {
			return -1, err
		}
		defer idLogger.Close() // This must happen before uidLog and gidLog is consumed.
		uncompressedCounter = ioutils.NewWriteCounter(idLogger)
		uncompressedWriter := (io.Writer)(uncompressedCounter)
		if uncompressedDigester != nil {
			uncompressedWriter = io.MultiWriter(uncompressedWriter, uncompressedDigester.Hash())
		}
		payload, err := asm.NewInputTarStream(io.TeeReader(uncompressed, uncompressedWriter), metadata, storage.NewDiscardFilePutter())
		if err != nil {
			return -1, err
		}
		options := drivers.ApplyDiffOpts{
			Diff:       payload,
			Mappings:   r.layerMappings(layer),
			MountLabel: layer.MountLabel,
		}
		size, err := r.driver.ApplyDiff(layer.ID, layer.Parent, options)
		if err != nil {
			return -1, err
		}
		return size, err
	}()
	if err != nil {
		return -1, err
	}

	if err := os.MkdirAll(filepath.Dir(r.tspath(layer.ID)), 0o700); err != nil {
		return -1, err
	}
	if err := ioutils.AtomicWriteFile(r.tspath(layer.ID), tsdata.Bytes(), 0o600); err != nil {
		return -1, err
	}
	if compressedDigester != nil {
		compressedDigest = compressedDigester.Digest()
	}
	if uncompressedDigester != nil {
		uncompressedDigest = uncompressedDigester.Digest()
	}
	if uncompressedDigest == "" && compression == archive.Uncompressed {
		uncompressedDigest = compressedDigest
	}

	updateDigestMap(&r.bycompressedsum, layer.CompressedDigest, compressedDigest, layer.ID)
	layer.CompressedDigest = compressedDigest
	if layerOptions != nil && layerOptions.OriginalDigest != "" && layerOptions.OriginalSize != nil {
		layer.CompressedSize = *layerOptions.OriginalSize
	} else {
		layer.CompressedSize = compressedCounter.Count
	}
	updateDigestMap(&r.byuncompressedsum, layer.UncompressedDigest, uncompressedDigest, layer.ID)
	layer.UncompressedDigest = uncompressedDigest
	layer.UncompressedSize = uncompressedCounter.Count
	layer.CompressionType = compression
	layer.UIDs = make([]uint32, 0, len(uidLog))
	for uid := range uidLog {
		layer.UIDs = append(layer.UIDs, uid)
	}
	sort.Slice(layer.UIDs, func(i, j int) bool {
		return layer.UIDs[i] < layer.UIDs[j]
	})
	layer.GIDs = make([]uint32, 0, len(gidLog))
	for gid := range gidLog {
		layer.GIDs = append(layer.GIDs, gid)
	}
	sort.Slice(layer.GIDs, func(i, j int) bool {
		return layer.GIDs[i] < layer.GIDs[j]
	})

	err = r.saveFor(layer)

	return size, err
}

// Requires (startReading or?) startWriting.
func (r *layerStore) DifferTarget(id string) (string, error) {
	ddriver, ok := r.driver.(drivers.DriverWithDiffer)
	if !ok {
		return "", ErrNotSupported
	}
	layer, ok := r.lookup(id)
	if !ok {
		return "", ErrLayerUnknown
	}
	return ddriver.DifferTarget(layer.ID)
}

// Requires startWriting.
func (r *layerStore) applyDiffFromStagingDirectory(id string, diffOutput *drivers.DriverWithDifferOutput, options *drivers.ApplyDiffWithDifferOpts) error {
	ddriver, ok := r.driver.(drivers.DriverWithDiffer)
	if !ok {
		return ErrNotSupported
	}
	layer, ok := r.lookup(id)
	if !ok {
		return ErrLayerUnknown
	}
	if options == nil {
		options = &drivers.ApplyDiffWithDifferOpts{
			ApplyDiffOpts: drivers.ApplyDiffOpts{
				Mappings:   r.layerMappings(layer),
				MountLabel: layer.MountLabel,
			},
			Flags: nil,
		}
	}

	err := ddriver.ApplyDiffFromStagingDirectory(layer.ID, layer.Parent, diffOutput, options)
	if err != nil {
		return err
	}
	layer.UIDs = diffOutput.UIDs
	layer.GIDs = diffOutput.GIDs
	updateDigestMap(&r.byuncompressedsum, layer.UncompressedDigest, diffOutput.UncompressedDigest, layer.ID)
	layer.UncompressedDigest = diffOutput.UncompressedDigest
	updateDigestMap(&r.bycompressedsum, layer.CompressedDigest, diffOutput.CompressedDigest, layer.ID)
	layer.CompressedDigest = diffOutput.CompressedDigest
	updateDigestMap(&r.bytocsum, layer.TOCDigest, diffOutput.TOCDigest, layer.ID)
	layer.TOCDigest = diffOutput.TOCDigest
	layer.UncompressedSize = diffOutput.Size
	layer.Metadata = diffOutput.Metadata
	if options != nil && options.Flags != nil {
		if layer.Flags == nil {
			layer.Flags = make(map[string]interface{})
		}
		maps.Copy(layer.Flags, options.Flags)
	}
	if err = r.saveFor(layer); err != nil {
		return err
	}

	if diffOutput.TarSplit != nil {
		tsdata := bytes.Buffer{}
		compressor, err := pgzip.NewWriterLevel(&tsdata, pgzip.BestSpeed)
		if err != nil {
			compressor = pgzip.NewWriter(&tsdata)
		}
		if err := compressor.SetConcurrency(1024*1024, 1); err != nil { // 1024*1024 is the hard-coded default; we're not changing that
			logrus.Infof("setting compression concurrency threads to 1: %v; ignoring", err)
		}
		if _, err := compressor.Write(diffOutput.TarSplit); err != nil {
			compressor.Close()
			return err
		}
		compressor.Close()
		if err := os.MkdirAll(filepath.Dir(r.tspath(layer.ID)), 0o700); err != nil {
			return err
		}
		if err := ioutils.AtomicWriteFile(r.tspath(layer.ID), tsdata.Bytes(), 0o600); err != nil {
			return err
		}
	}
	for k, v := range diffOutput.BigData {
		if err := r.SetBigData(id, k, bytes.NewReader(v)); err != nil {
			if err2 := r.Delete(id); err2 != nil {
				logrus.Errorf("While recovering from a failure to set big data, error deleting layer %#v: %v", id, err2)
			}
			return err
		}
	}
	return err
}

// It must be called without any c/storage locks held to allow differ to make c/storage calls.
func (r *layerStore) applyDiffWithDifferNoLock(options *drivers.ApplyDiffWithDifferOpts, differ drivers.Differ) (*drivers.DriverWithDifferOutput, error) {
	ddriver, ok := r.driver.(drivers.DriverWithDiffer)
	if !ok {
		return nil, ErrNotSupported
	}

	output, err := ddriver.ApplyDiffWithDiffer(options, differ)
	return &output, err
}

func (r *layerStore) CleanupStagingDirectory(stagingDirectory string) error {
	ddriver, ok := r.driver.(drivers.DriverWithDiffer)
	if !ok {
		return ErrNotSupported
	}
	return ddriver.CleanupStagingDirectory(stagingDirectory)
}

// Requires startReading or startWriting.
func (r *layerStore) layersByDigestMap(m map[digest.Digest][]string, d digest.Digest) ([]Layer, error) {
	var layers []Layer
	for _, layerID := range m[d] {
		layer, ok := r.lookup(layerID)
		if !ok {
			return nil, ErrLayerUnknown
		}
		layers = append(layers, *copyLayer(layer))
	}
	return layers, nil
}

// Requires startReading or startWriting.
func (r *layerStore) LayersByCompressedDigest(d digest.Digest) ([]Layer, error) {
	return r.layersByDigestMap(r.bycompressedsum, d)
}

// Requires startReading or startWriting.
func (r *layerStore) LayersByUncompressedDigest(d digest.Digest) ([]Layer, error) {
	return r.layersByDigestMap(r.byuncompressedsum, d)
}

// Requires startReading or startWriting.
func (r *layerStore) LayersByTOCDigest(d digest.Digest) ([]Layer, error) {
	return r.layersByDigestMap(r.bytocsum, d)
}

func closeAll(closes ...func() error) (rErr error) {
	for _, f := range closes {
		if err := f(); err != nil {
			if rErr == nil {
				rErr = fmt.Errorf("close error: %w", err)
				continue
			}
			rErr = fmt.Errorf("%v: %w", err, rErr)
		}
	}
	return
}