-
Notifications
You must be signed in to change notification settings - Fork 455
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix race condition allowing multiple Tick() calls to a single Shard #409
Changes from 7 commits
c7a65a4
4662910
2363d90
10ce48f
bfbe5dd
d768a90
194c5cc
1195a9b
04affcf
7826ab3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -384,18 +384,19 @@ func (n *dbNamespace) closeShards(shards []databaseShard, blockUntilClosed bool) | |
} | ||
} | ||
|
||
func (n *dbNamespace) Tick(c context.Cancellable) { | ||
func (n *dbNamespace) Tick(c context.Cancellable) error { | ||
shards := n.GetOwnedShards() | ||
|
||
if len(shards) == 0 { | ||
return | ||
return nil | ||
} | ||
|
||
// Tick through the shards sequentially to avoid parallel data flushes | ||
var ( | ||
r tickResult | ||
l sync.Mutex | ||
wg sync.WaitGroup | ||
r tickResult | ||
multiErr xerrors.MultiError | ||
l sync.Mutex | ||
wg sync.WaitGroup | ||
) | ||
for _, shard := range shards { | ||
shard := shard | ||
|
@@ -406,18 +407,20 @@ func (n *dbNamespace) Tick(c context.Cancellable) { | |
if c.IsCancelled() { | ||
return | ||
} | ||
shardResult := shard.Tick(c) | ||
|
||
shardResult, err := shard.Tick(c) | ||
|
||
l.Lock() | ||
r = r.merge(shardResult) | ||
multiErr = multiErr.Add(err) | ||
l.Unlock() | ||
}) | ||
} | ||
|
||
wg.Wait() | ||
|
||
if c.IsCancelled() { | ||
return | ||
return multiErr.FinalError() | ||
} | ||
|
||
n.statsLastTick.Lock() | ||
|
@@ -435,6 +438,8 @@ func (n *dbNamespace) Tick(c context.Cancellable) { | |
n.metrics.tick.madeUnwiredBlocks.Inc(int64(r.madeUnwiredBlocks)) | ||
n.metrics.tick.mergedOutOfOrderBlocks.Inc(int64(r.mergedOutOfOrderBlocks)) | ||
n.metrics.tick.errors.Inc(int64(r.errors)) | ||
|
||
return multiErr.FinalError() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think we want to emit stats or set the last tick stats unless err == nil. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed up |
||
} | ||
|
||
func (n *dbNamespace) Write( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,6 +47,7 @@ import ( | |
xerrors "github.com/m3db/m3x/errors" | ||
xtime "github.com/m3db/m3x/time" | ||
|
||
uatomic "github.com/uber-go/atomic" | ||
"github.com/uber-go/tally" | ||
) | ||
|
||
|
@@ -56,8 +57,10 @@ const ( | |
) | ||
|
||
var ( | ||
errShardEntryNotFound = errors.New("shard entry not found") | ||
errShardNotOpen = errors.New("shard is not open") | ||
errShardEntryNotFound = errors.New("shard entry not found") | ||
errShardNotOpen = errors.New("shard is not open") | ||
errShardAlreadyTicking = errors.New("shard is already ticking") | ||
errShardClosingTickTerminated = errors.New("shard is closing, terminating tick") | ||
) | ||
|
||
type filesetBeforeFn func(filePathPrefix string, namespace ts.ID, shardID uint32, t time.Time) ([]string, error) | ||
|
@@ -66,7 +69,7 @@ type tickPolicy int | |
|
||
const ( | ||
tickPolicyRegular tickPolicy = iota | ||
tickPolicyForceExpiry | ||
tickPolicyCloseShard | ||
) | ||
|
||
type dbShardState int | ||
|
@@ -100,6 +103,8 @@ type dbShard struct { | |
identifierPool ts.IdentifierPool | ||
contextPool context.Pool | ||
flushState shardFlushState | ||
ticking *uatomic.Bool | ||
tickWg *sync.WaitGroup | ||
runtimeOptsListenClosers []xclose.SimpleCloser | ||
currRuntimeOptions dbShardRuntimeOptions | ||
metrics dbShardMetrics | ||
|
@@ -206,6 +211,8 @@ func newDatabaseShard( | |
identifierPool: opts.IdentifierPool(), | ||
contextPool: opts.ContextPool(), | ||
flushState: newShardFlushState(), | ||
ticking: uatomic.NewBool(false), | ||
tickWg: &sync.WaitGroup{}, | ||
metrics: newDatabaseShardMetrics(scope), | ||
} | ||
d.insertQueue = newDatabaseShardInsertQueue(d.insertSeriesBatch, | ||
|
@@ -360,30 +367,74 @@ func (s *dbShard) Close() error { | |
stopwatch.Stop() | ||
}() | ||
|
||
// NB(prateek): wait till any existing ticks are finished. In the usual | ||
// case, no other ticks are running, and tickWg count is at 0, so the | ||
// call to Wait() will return immediately. | ||
// In the case when there is an existing Tick running, the count for | ||
// tickWg will be > 0, and we'll wait until it's reset to zero, which | ||
// will happen because earlier in this function we set the shard state | ||
// to dbShardStateClosing, which triggers an early termination of | ||
// any active ticks. | ||
s.tickWg.Wait() | ||
|
||
// NB(r): Asynchronously we purge expired series to ensure pressure on the | ||
// GC is not placed all at one time. If the deadline is too low and still | ||
// causes the GC to impact performance when closing shards the deadline | ||
// should be increased. | ||
cancellable := context.NewNoOpCanncellable() | ||
s.tickAndExpire(cancellable, tickPolicyForceExpiry) | ||
_, err := s.tickAndExpire(cancellable, tickPolicyCloseShard) | ||
return err | ||
} | ||
|
||
return nil | ||
func (s *dbShard) isClosing() bool { | ||
s.RLock() | ||
state := s.state | ||
s.RUnlock() | ||
return state == dbShardStateClosing | ||
} | ||
|
||
func (s *dbShard) Tick(c context.Cancellable) tickResult { | ||
func (s *dbShard) Tick(c context.Cancellable) (tickResult, error) { | ||
s.removeAnyFlushStatesTooEarly() | ||
return s.tickAndExpire(c, tickPolicyRegular) | ||
} | ||
|
||
func (s *dbShard) tickAndExpire( | ||
c context.Cancellable, | ||
policy tickPolicy, | ||
) tickResult { | ||
) (tickResult, error) { | ||
terminateDueToClosing := func() bool { | ||
// NB(prateek): we bail out early if the shard is closing, | ||
// unless it's the final tick issued during the Close(). This | ||
// final tick is required to release resources back to our pools. | ||
return policy != tickPolicyCloseShard && s.isClosing() | ||
} | ||
|
||
if terminateDueToClosing() { | ||
return tickResult{}, errShardClosingTickTerminated | ||
} | ||
|
||
// ensure only one tick can execute at a time | ||
if s.ticking.Swap(true) { | ||
// i.e. we were previously ticking | ||
return tickResult{}, errShardAlreadyTicking | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the tickPolicy is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sure thing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
|
||
// enable Close() to track the lifecycle of the tick | ||
s.tickWg.Add(1) | ||
|
||
defer func() { | ||
// reset ticking state | ||
s.ticking.Store(false) | ||
// indicate we're done ticking | ||
s.tickWg.Done() | ||
}() | ||
|
||
var ( | ||
r tickResult | ||
expired []series.DatabaseSeries | ||
i int | ||
slept time.Duration | ||
r tickResult | ||
expired []series.DatabaseSeries | ||
terminatedTickingDueToClosing bool | ||
i int | ||
slept time.Duration | ||
) | ||
s.RLock() | ||
tickSleepBatch := s.currRuntimeOptions.tickSleepSeriesBatchSize | ||
|
@@ -397,6 +448,10 @@ func (s *dbShard) tickAndExpire( | |
if c.IsCancelled() { | ||
return false | ||
} | ||
if terminateDueToClosing() { | ||
terminatedTickingDueToClosing = true | ||
return false | ||
} | ||
// Throttle the tick | ||
sleepFor := time.Duration(tickSleepBatch) * tickSleepPerSeries | ||
s.sleepFn(sleepFor) | ||
|
@@ -409,7 +464,7 @@ func (s *dbShard) tickAndExpire( | |
switch policy { | ||
case tickPolicyRegular: | ||
result, err = entry.series.Tick() | ||
case tickPolicyForceExpiry: | ||
case tickPolicyCloseShard: | ||
err = series.ErrSeriesAllDatapointsExpired | ||
} | ||
if err == series.ErrSeriesAllDatapointsExpired { | ||
|
@@ -449,7 +504,11 @@ func (s *dbShard) tickAndExpire( | |
s.purgeExpiredSeries(expired) | ||
} | ||
|
||
return r | ||
if terminatedTickingDueToClosing { | ||
return tickResult{}, errShardClosingTickTerminated | ||
} | ||
|
||
return r, nil | ||
} | ||
|
||
func (s *dbShard) purgeExpiredSeries(expired []series.DatabaseSeries) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need to pull this in anymore?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
doh I remember thinking ill forget to revert that :|