bug(core): Fixed infinite loop in CommitToDisk (#8614)
Whenever an error happens in CommitToDisk, we retry the function according to
x.Config.MaxRetries. However, the default value was -1, which caused the
function to get stuck in an infinite loop. As a result, Dgraph could neither
commit the transaction nor move forward to newer queries.
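
For illustration, a minimal, standalone Go sketch (the retryOld/retryNew names
are made up for this example, not Dgraph's) of why a retry budget of -1 never
terminates under the old retry != 0 condition, while the retry > 0 form adopted
in this commit returns control to the caller:

package main

import (
	"errors"
	"fmt"
)

// retryOld mirrors the old loop condition: with maxRetries = -1 the counter
// goes -1, -2, -3, ... and never reaches 0, so a persistently failing f
// keeps the loop spinning forever.
func retryOld(maxRetries int, f func() error) error {
	var err error
	for retry := maxRetries; retry != 0; retry-- {
		if err = f(); err == nil {
			return nil
		}
	}
	return err
}

// retryNew uses retry > 0, so a non-positive maxRetries simply skips the loop
// and the caller gets control back instead of hanging.
func retryNew(maxRetries int, f func() error) error {
	var err error
	for retry := maxRetries; retry > 0; retry-- {
		if err = f(); err == nil {
			return nil
		}
	}
	return err
}

func main() {
	failing := func() error { return errors.New("commit failed") }
	fmt.Println(retryNew(-1, failing)) // prints <nil>: loop body never runs
	_ = retryOld                       // retryOld(-1, failing) would never return
}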

The data handled by CommitToDisk has to make it to disk: if the error is
swallowed, different Alphas in a group can end up holding different data,
leading to data loss. To avoid this, we now panic if CommitToDisk still fails
after 10 retries. Once the Alpha is restarted (and the underlying issue is
fixed), it starts working again. This way, Alphas won't fail silently, and we
will know whenever such an issue occurs.
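
As a rough sketch of the new behaviour on the caller side (the exponentialRetry
and mustCommit wrappers below are illustrative stand-ins that assume the
x.ExponentialRetry semantics shown in the diff further down, not the actual
commitOrAbort code):

package main

import (
	"log"
	"time"
)

// exponentialRetry follows the shape of x.ExponentialRetry added in this
// commit: call f up to maxRetries times, doubling the wait after each failure.
func exponentialRetry(maxRetries int, wait time.Duration, f func() error) error {
	var err error
	for retry := maxRetries; retry > 0; retry-- {
		if err = f(); err == nil {
			return nil
		}
		if wait > 0 {
			time.Sleep(wait) // 20ms, 40ms, 80ms, ... between attempts
			wait *= 2
		}
	}
	return err
}

// mustCommit stands in for the commitOrAbort change: if the disk commit still
// fails after every retry, panic so the Alpha crashes loudly and restarts,
// instead of silently holding data that differs from the rest of its group.
func mustCommit(commitToDisk func() error) {
	if err := exponentialRetry(10, 20*time.Millisecond, commitToDisk); err != nil {
		panic(err)
	}
}

func main() {
	mustCommit(func() error { return nil }) // succeeds on the first attempt
	log.Println("committed")
}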

Fixes: https://github.com/dgraph-io/projects/issues/85
harshil-goel authored and all-seeing-code committed Feb 8, 2023
1 parent 2ae33e2 commit b70d235
Showing 3 changed files with 23 additions and 4 deletions.
7 changes: 5 additions & 2 deletions worker/draft.go
@@ -819,8 +819,10 @@ func (n *node) commitOrAbort(pkey uint64, delta *pb.OracleDelta) error {
 			return
 		}
 		txn.Update()
-		err := x.RetryUntilSuccess(int(x.Config.MaxRetries),
-			10*time.Millisecond, func() error {
+		// We start with 20 ms, so that we end up waiting 5 mins by the end.
+		// If there is any transient issue, it should get fixed within that timeframe.
+		err := x.ExponentialRetry(int(x.Config.MaxRetries),
+			20*time.Millisecond, func() error {
 			err := txn.CommitToDisk(writer, commit)
 			if err == badger.ErrBannedKey {
 				glog.Errorf("Error while writing to banned namespace.")
@@ -832,6 +834,7 @@ func (n *node) commitOrAbort(pkey uint64, delta *pb.OracleDelta) error {
 		if err != nil {
 			glog.Errorf("Error while applying txn status to disk (%d -> %d): %v",
 				start, commit, err)
+			panic(err)
 		}
 	}

2 changes: 1 addition & 1 deletion worker/server_state.go
@@ -48,7 +48,7 @@ const (
 		`client_key=; sasl-mechanism=PLAIN; tls=false;`
 	LimitDefaults = `mutations=allow; query-edge=1000000; normalize-node=10000; ` +
 		`mutations-nquad=1000000; disallow-drop=false; query-timeout=0ms; txn-abort-after=5m; ` +
-		` max-retries=-1;max-pending-queries=10000`
+		` max-retries=10;max-pending-queries=10000`
 	ZeroLimitsDefaults = `uid-lease=0; refill-interval=30s; disable-admin-http=false;`
 	GraphQLDefaults = `introspection=true; debug=false; extensions=true; poll-interval=1s; ` +
 		`lambda-url=;`
18 changes: 17 additions & 1 deletion x/x.go
@@ -613,11 +613,27 @@ func Max(a, b uint64) uint64 {
 	return b
 }
 
+// ExponentialRetry runs the given function until it succeeds or can no longer be retried.
+func ExponentialRetry(maxRetries int, waitAfterFailure time.Duration,
+	f func() error) error {
+	var err error
+	for retry := maxRetries; retry > 0; retry-- {
+		if err = f(); err == nil {
+			return nil
+		}
+		if waitAfterFailure > 0 {
+			time.Sleep(waitAfterFailure)
+			waitAfterFailure *= 2
+		}
+	}
+	return err
+}
+
 // RetryUntilSuccess runs the given function until it succeeds or can no longer be retried.
 func RetryUntilSuccess(maxRetries int, waitAfterFailure time.Duration,
 	f func() error) error {
 	var err error
-	for retry := maxRetries; retry != 0; retry-- {
+	for retry := maxRetries; retry > 0; retry-- {
 		if err = f(); err == nil {
 			return nil
 		}
