From 6985af5e88b9aa20aec2871741abbe6dd7dc6086 Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Wed, 28 Oct 2020 18:10:29 -0700 Subject: [PATCH 1/2] keep retrying the proof until we run out of sectors to skip If we have a bunch of corrupted but not missing sectors on disk, we may need to retry many times before we get a proof to pass. Simply giving up doesn't help anyone. --- storage/wdpost_run.go | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/storage/wdpost_run.go b/storage/wdpost_run.go index 87438fec3ce..e7bd741627c 100644 --- a/storage/wdpost_run.go +++ b/storage/wdpost_run.go @@ -510,10 +510,10 @@ func (s *WindowPoStScheduler) runPost(ctx context.Context, di dline.Info, ts *ty skipCount := uint64(0) postSkipped := bitfield.New() - var postOut []proof2.PoStProof - somethingToProve := true + somethingToProve := false - for retries := 0; retries < 5; retries++ { + // Retry until we run out of sectors to prove. + for retries := 0; ; retries++ { var partitions []miner.PoStPartition var sinfos []proof2.SectorInfo for partIdx, partition := range batch { @@ -567,7 +567,6 @@ func (s *WindowPoStScheduler) runPost(ctx context.Context, di dline.Info, ts *ty if len(sinfos) == 0 { // nothing to prove for this batch - somethingToProve = false break } @@ -585,24 +584,31 @@ func (s *WindowPoStScheduler) runPost(ctx context.Context, di dline.Info, ts *ty return nil, err } - var ps []abi.SectorID - postOut, ps, err = s.prover.GenerateWindowPoSt(ctx, abi.ActorID(mid), sinfos, abi.PoStRandomness(rand)) + postOut, ps, err := s.prover.GenerateWindowPoSt(ctx, abi.ActorID(mid), sinfos, abi.PoStRandomness(rand)) elapsed := time.Since(tsStart) log.Infow("computing window post", "batch", batchIdx, "elapsed", elapsed) if err == nil { - // Proof generation successful, stop retrying - params.Partitions = append(params.Partitions, partitions...) + if len(postOut) == 0 { + return nil, xerrors.Errorf("received no proofs back from generate window post") + } + // Proof generation successful, stop retrying + somethingToProve = true + params.Partitions = partitions + params.Proofs = postOut break } // Proof generation failed, so retry if len(ps) == 0 { + // If we didn't skip any new sectors, we failed + // for some other reason and we need to abort. return nil, xerrors.Errorf("running window post failed: %w", err) } + // TODO: maybe mark these as faulty somewhere? log.Warnw("generate window post skipped sectors", "sectors", ps, "error", err, "try", retries) @@ -617,12 +623,6 @@ func (s *WindowPoStScheduler) runPost(ctx context.Context, di dline.Info, ts *ty continue } - if len(postOut) == 0 { - return nil, xerrors.Errorf("received no proofs back from generate window post") - } - - params.Proofs = postOut - posts = append(posts, params) } From 077bc83f7f7bc454ce27b0f44e0de6351d1331c1 Mon Sep 17 00:00:00 2001 From: Steven Allen Date: Fri, 30 Oct 2020 14:00:41 -0700 Subject: [PATCH 2/2] explicitly abort PoSt on context cancellation --- storage/wdpost_run.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/storage/wdpost_run.go b/storage/wdpost_run.go index e7bd741627c..f1da4f2212f 100644 --- a/storage/wdpost_run.go +++ b/storage/wdpost_run.go @@ -612,6 +612,15 @@ func (s *WindowPoStScheduler) runPost(ctx context.Context, di dline.Info, ts *ty log.Warnw("generate window post skipped sectors", "sectors", ps, "error", err, "try", retries) + // Explicitly make sure we haven't aborted this PoSt + // (GenerateWindowPoSt may or may not check this). + // Otherwise, we could try to continue proving a + // deadline after the deadline has ended. + if ctx.Err() != nil { + log.Warnw("aborting PoSt due to context cancellation", "error", ctx.Err(), "deadline", di.Index) + return nil, ctx.Err() + } + skipCount += uint64(len(ps)) for _, sector := range ps { postSkipped.Set(uint64(sector.Number))