GC respects target for max hotstore space
ZenGround0 committed Mar 7, 2023
1 parent d07c909 commit e6814fb
Showing 9 changed files with 104 additions and 13 deletions.
2 changes: 1 addition & 1 deletion api/version.go
@@ -55,7 +55,7 @@ func VersionForType(nodeType NodeType) (Version, error) {
// semver versions of the rpc api exposed
var (
FullAPIVersion0 = newVer(1, 5, 0)
-    FullAPIVersion1 = newVer(2, 4, 0)
+    FullAPIVersion1 = newVer(2, 3, 0)

MinerAPIVersion0 = newVer(1, 5, 0)
WorkerAPIVersion0 = newVer(1, 7, 0)
10 changes: 10 additions & 0 deletions blockstore/splitstore/splitstore.go
@@ -115,6 +115,14 @@ type Config struct {
// A positive value is the number of compactions before a full GC is performed;
// a value of 1 will perform full GC in every compaction.
HotStoreFullGCFrequency uint64

+    // HotstoreMaxSpaceTarget suggests the max allowed space the hotstore can take.
+    // This is not a hard limit; it is possible for the hotstore to exceed the target,
+    // for example if state grows massively between compactions. The splitstore
+    // will make a best effort to avoid overflowing the target and in practice should
+    // never overflow. This field is used when doing GC at the end of a compaction to
+    // adaptively choose moving GC.
+    HotstoreMaxSpaceTarget uint64
}

// ChainAccessor allows the Splitstore to access the chain. It will most likely
@@ -165,6 +173,7 @@ type SplitStore struct {

compactionIndex int64
pruneIndex int64
+    onlineGCCnt     int64

ctx context.Context
cancel func()
@@ -203,6 +212,7 @@ type SplitStore struct {
szWalk int64
szProtectedTxns int64
szToPurge int64 // expected purges before critical section protections and live marking
+    szKeys          int64 // approximate; does not count keys protected when entering the critical section

// protected by txnLk
szMarkedLiveRefs int64
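Taken together with the size counters above, this is everything the splitstore needs for target-aware GC. As a rough sketch of how a caller might enable it (the values are illustrative; path, ds, hot, and cold are assumed to be initialized elsewhere, and the Open call mirrors the one in node/modules/blockstore.go at the end of this diff):

```go
// Sketch only; the numbers are illustrative, not recommendations.
cfg := &splitstore.Config{
	HotStoreFullGCFrequency: 20,              // full GC roughly every 20 compactions
	HotstoreMaxSpaceTarget:  650_000_000_000, // soft ~650 GB cap; 0 disables targeting
}
ss, err := splitstore.Open(path, ds, hot, cold, cfg)
if err != nil {
	return xerrors.Errorf("failed to open splitstore: %w", err)
}
```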
2 changes: 1 addition & 1 deletion blockstore/splitstore/splitstore_check.go
@@ -95,7 +95,7 @@ func (s *SplitStore) doCheck(curTs *types.TipSet) error {
}
defer visitor.Close() //nolint

-    size, err := s.walkChain(curTs, boundaryEpoch, boundaryEpoch, visitor,
+    size := s.walkChain(curTs, boundaryEpoch, boundaryEpoch, visitor,
func(c cid.Cid) error {
if isUnitaryObject(c) {
return errStopWalk
18 changes: 15 additions & 3 deletions blockstore/splitstore/splitstore_compact.go
@@ -66,7 +66,8 @@ var (
)

const (
-    batchSize = 16384
+    batchSize  = 16384
+    cidKeySize = 32
)

func (s *SplitStore) HeadChange(_, apply []*types.TipSet) error {
@@ -518,6 +519,7 @@ func (s *SplitStore) doCompact(curTs *types.TipSet) error {
// might be potentially inconsistent; abort compaction and notify the user to intervene.
return xerrors.Errorf("checkpoint exists; aborting compaction")
}
+    s.clearSizeMeasurements()

currentEpoch := curTs.Height()
boundaryEpoch := currentEpoch - CompactionBoundary
@@ -709,6 +711,7 @@ func (s *SplitStore) doCompact(curTs *types.TipSet) error {

log.Infow("compaction stats", "hot", hotCnt, "cold", coldCnt, "purge", purgeCnt, "purge size", szPurge)
s.szToPurge = int64(szPurge)
+    s.szKeys = int64(hotCnt) * cidKeySize
stats.Record(s.ctx, metrics.SplitstoreCompactionHot.M(int64(hotCnt)))
stats.Record(s.ctx, metrics.SplitstoreCompactionCold.M(int64(coldCnt)))
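For a sense of scale (the block count is an assumption, not from the commit): with cidKeySize = 32, a hotstore holding 100 million blocks yields szKeys of roughly 100,000,000 x 32 B, about 3.2 GB, one of the four terms summed into copySizeApprox in splitstore_gc.go below.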

@@ -1473,8 +1476,9 @@ func (s *SplitStore) completeCompaction() error {
}
s.compactType = none

-    // Note: at this point we can start the splitstore; a compaction should run on
-    // the first head change, which will trigger gc on the hotstore.
+    // Note: at this point we can start the splitstore; base epoch is not
+    // incremented here so a compaction should run on the first head
+    // change, which will trigger gc on the hotstore.
// We don't mind the second (back-to-back) compaction as the head will
// have advanced during marking and coldset accumulation.
return nil
@@ -1532,6 +1536,14 @@ func (s *SplitStore) completePurge(coldr *ColdSetReader, checkpoint *Checkpoint,
return nil
}

+func (s *SplitStore) clearSizeMeasurements() {
+    s.szKeys = 0
+    s.szMarkedLiveRefs = 0
+    s.szProtectedTxns = 0
+    s.szToPurge = 0
+    s.szWalk = 0
+}
+
// I really don't like having this code, but we seem to have some occasional DAG references with
// missing constituents. During testing in mainnet *some* of these references *sometimes* appeared
// after a little bit.
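Since clearSizeMeasurements now runs near the top of doCompact (the s.clearSizeMeasurements() call added above), each compaction's sz* estimates start from zero rather than accumulating across runs.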
55 changes: 47 additions & 8 deletions blockstore/splitstore/splitstore_gc.go
@@ -7,17 +7,56 @@ import (
bstore "github.com/filecoin-project/lotus/blockstore"
)

+const (
+    // When < 150 GB of space would remain during moving GC, trigger moving GC
+    targetThreshold = 150_000_000_000
+    // Don't attempt moving GC if 50 GB or less would remain during the move
+    targetBuffer = 50_000_000_000
+    // Fraction of garbage in a badger vlog required for online GC traversal to collect it
+    aggressiveOnlineGCThreshold = 0.0001
+)
+
func (s *SplitStore) gcHotAfterCompaction() {
-    // TODO size aware GC
-    // 1. Add a config value to specify targeted max number of bytes M
-    // 2. Use measurement of marked hotstore size H (we now have this), actual hotstore size T (need to compute this), total move size H + T, approximate purged size P
-    // 3. Trigger moving GC whenever H + T is within 50 GB of M
-    // 4. If H + T > M, use aggressive online threshold
-    // 5. Use threshold that covers 3 std devs of vlogs when doing aggressive online GC. Mean == (H + P) / T, assume normal distribution
-    // 6. Use threshold that covers 1 or 2 std devs of vlogs when doing regular online GC
+    // Measure hotstore size, determine if we should do full GC, determine if we can do full GC.
+    // We should do full GC if
+    //   HotStoreFullGCFrequency is specified and the compaction index matches the frequency,
+    //   OR HotstoreMaxSpaceTarget is specified and the total moving space is within 150 GB of the target.
+    // We can do full GC if
+    //   HotstoreMaxSpaceTarget is not specified,
+    //   OR the total moving space would stay at least 50 GB below the target.
+    //
+    // a) If we should not do full GC => online GC
+    // b) If we should do full GC and can => moving GC
+    // c) If we should do full GC and can't => aggressive online GC
+    var hotSize int64
+    var err error
+    sizer, ok := s.hot.(bstore.BlockstoreSize)
+    if ok {
+        hotSize, err = sizer.Size()
+        if err != nil {
+            log.Warnf("error getting hotstore size: %s, estimating empty hot store for targeting", err)
+            hotSize = 0
+        }
+    } else {
+        hotSize = 0
+    }
+
+    copySizeApprox := s.szKeys + s.szMarkedLiveRefs + s.szProtectedTxns + s.szWalk
+    shouldTarget := s.cfg.HotstoreMaxSpaceTarget > 0 && hotSize+copySizeApprox > int64(s.cfg.HotstoreMaxSpaceTarget)-targetThreshold
+    shouldFreq := s.cfg.HotStoreFullGCFrequency > 0 && s.compactionIndex%int64(s.cfg.HotStoreFullGCFrequency) == 0
+    shouldDoFull := shouldTarget || shouldFreq
+    canDoFull := s.cfg.HotstoreMaxSpaceTarget == 0 || hotSize+copySizeApprox < int64(s.cfg.HotstoreMaxSpaceTarget)-targetBuffer
+    log.Infof("measured hot store size: %d, approximate new size: %d, should do full %t, can do full %t", hotSize, copySizeApprox, shouldDoFull, canDoFull)
+
var opts []bstore.BlockstoreGCOption
-    if s.cfg.HotStoreFullGCFrequency > 0 && s.compactionIndex%int64(s.cfg.HotStoreFullGCFrequency) == 0 {
+    if shouldDoFull && canDoFull {
opts = append(opts, bstore.WithFullGC(true))
} else if shouldDoFull && !canDoFull {
log.Warnf("Attention! Estimated moving GC size %d is not within safety buffer %d of target max %d, performing aggressive online GC to attempt to bring hotstore size down safely", copySizeApprox, targetBuffer, s.cfg.HotstoreMaxSpaceTarget)
log.Warn("If problem continues you can 1) temporarily allocate more disk space to hotstore and 2) reflect in HotstoreMaxSpaceTarget OR trigger manual move with `lotus chain prune hot-moving`")
log.Warn("If problem continues and you do not have any more disk space you can run continue to manually trigger online GC at agressive thresholds (< 0.01) with `lotus chain prune hot`")

opts = append(opts, bstore.WithThreshold(aggressiveOnlineGCThreshold))
}

if err := s.gcBlockstore(s.hot, opts); err != nil {
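To make the three branches concrete, here is a self-contained sketch (not part of the commit) that mirrors the shouldTarget/shouldFreq/canDoFull decision table above, with made-up sizes:

```go
package main

import "fmt"

// The constants mirror splitstore_gc.go above; the sizes in main are hypothetical.
const (
	targetThreshold = 150_000_000_000 // prefer moving GC within 150 GB of the target
	targetBuffer    = 50_000_000_000  // forbid moving GC within 50 GB of the target
)

// decide reproduces the decision logic from gcHotAfterCompaction.
func decide(hotSize, copySizeApprox, compactionIndex int64, maxSpaceTarget, fullGCFrequency uint64) string {
	shouldTarget := maxSpaceTarget > 0 && hotSize+copySizeApprox > int64(maxSpaceTarget)-targetThreshold
	shouldFreq := fullGCFrequency > 0 && compactionIndex%int64(fullGCFrequency) == 0
	canDoFull := maxSpaceTarget == 0 || hotSize+copySizeApprox < int64(maxSpaceTarget)-targetBuffer

	switch {
	case (shouldTarget || shouldFreq) && canDoFull:
		return "moving (full) GC"
	case (shouldTarget || shouldFreq) && !canDoFull:
		return "aggressive online GC"
	default:
		return "regular online GC"
	}
}

func main() {
	// 500 GB target, 280 GB hotstore, ~90 GB projected copy: 370 GB is past
	// the 350 GB threshold but under the 450 GB buffer => moving GC.
	fmt.Println(decide(280e9, 90e9, 3, 500e9, 20))
	// Same target, 380 GB hotstore: 470 GB projected is not under the
	// 450 GB buffer, so a move could overflow the disk => aggressive online GC.
	fmt.Println(decide(380e9, 90e9, 3, 500e9, 20))
	// No target set, and compaction index 3 is not a multiple of 20 => online GC.
	fmt.Println(decide(280e9, 90e9, 3, 0, 20))
}
```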
11 changes: 11 additions & 0 deletions documentation/en/default-lotus-config.toml
@@ -230,6 +230,17 @@
# env var: LOTUS_CHAINSTORE_SPLITSTORE_HOTSTOREFULLGCFREQUENCY
#HotStoreFullGCFrequency = 20

+# HotStoreMaxSpaceTarget sets a target max disk size for the hotstore. Splitstore GC
+# will run moving GC if disk utilization gets within a threshold (150 GB) of the target.
+# Splitstore GC will NOT run moving GC if the total size of the move would get
+# within 50 GB of the target, and will instead run a more aggressive online GC.
+# If both HotStoreFullGCFrequency and HotStoreMaxSpaceTarget are set then splitstore
+# GC will trigger moving GC if either condition is met.
+#
+# type: uint64
+# env var: LOTUS_CHAINSTORE_SPLITSTORE_HOTSTOREMAXSPACETARGET
+#HotStoreMaxSpaceTarget = 0
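As an illustration (the number is hypothetical, not a recommendation), a node whose hotstore disk holds 1 TB might reserve headroom for the rest of the repo and set:

```toml
[Chainstore.Splitstore]
  # ~850 GB soft cap: moving GC is preferred once the projected hotstore size
  # comes within 150 GB of this figure, and blocked within the last 50 GB.
  HotStoreMaxSpaceTarget = 850000000000
```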


[Cluster]
# EXPERIMENTAL. config to enable node cluster with raft consensus
11 changes: 11 additions & 0 deletions node/config/doc_gen.go

Some generated files are not rendered by default.

7 changes: 7 additions & 0 deletions node/config/types.go
@@ -601,6 +601,13 @@ type Splitstore struct {
// A value of 0 disables, while a value of 1 will do full GC in every compaction.
// Default is 20 (about once a week).
HotStoreFullGCFrequency uint64
+    // HotStoreMaxSpaceTarget sets a target max disk size for the hotstore. Splitstore GC
+    // will run moving GC if disk utilization gets within a threshold (150 GB) of the target.
+    // Splitstore GC will NOT run moving GC if the total size of the move would get
+    // within 50 GB of the target, and will instead run a more aggressive online GC.
+    // If both HotStoreFullGCFrequency and HotStoreMaxSpaceTarget are set then splitstore
+    // GC will trigger moving GC if either condition is met.
+    HotStoreMaxSpaceTarget uint64
}

// // Full Node
1 change: 1 addition & 0 deletions node/modules/blockstore.go
@@ -87,6 +87,7 @@ func SplitBlockstore(cfg *config.Chainstore) func(lc fx.Lifecycle, r repo.Locked
UniversalColdBlocks: cfg.Splitstore.ColdStoreType == "universal",
HotStoreMessageRetention: cfg.Splitstore.HotStoreMessageRetention,
HotStoreFullGCFrequency: cfg.Splitstore.HotStoreFullGCFrequency,
+    HotstoreMaxSpaceTarget:   cfg.Splitstore.HotStoreMaxSpaceTarget,
}
ss, err := splitstore.Open(path, ds, hot, cold, cfg)
if err != nil {
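Note that the casing difference here is in the source itself: the user-facing config field is HotStoreMaxSpaceTarget (capital S in Store) while the splitstore's internal option is HotstoreMaxSpaceTarget; this added line is where the two are bridged.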
