Skip to content

Commit b70d235

Browse files
harshil-goel and all-seeing-code
authored and committed
bug(core): Fixed infinite loop in CommitToDisk (#8614)
Whenever an error happens in CommitToDisk, we retry the function according to x.config.MaxRetries. However, the value was set to -1, which caused the function to get stuck in an infinite loop. This led to Dgraph being unable to commit the transaction or move forward to newer queries. The CommitToDisk function is required to push data to disk. In case of an error, different Alphas in a group can end up having different data, leading to data loss. To avoid such issues, we panic in case we are not able to CommitToDisk after 10 retries. Once the Alpha is restarted, if the issue is fixed, the Alpha would start to work again. This way, Alphas won't fail silently and we would know if any issue was occurring. Fixes: https://github.com/dgraph-io/projects/issues/85
1 parent 2ae33e2 commit b70d235

File tree

3 files changed

+23
-4
lines changed

3 files changed

+23
-4
lines changed

worker/draft.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -819,8 +819,10 @@ func (n *node) commitOrAbort(pkey uint64, delta *pb.OracleDelta) error {
819819
return
820820
}
821821
txn.Update()
822-
err := x.RetryUntilSuccess(int(x.Config.MaxRetries),
823-
10*time.Millisecond, func() error {
822+
// We start with 20 ms, so that we end up waiting 5 mins by the end.
823+
// If there is any transient issue, it should get fixed within that timeframe.
824+
err := x.ExponentialRetry(int(x.Config.MaxRetries),
825+
20*time.Millisecond, func() error {
824826
err := txn.CommitToDisk(writer, commit)
825827
if err == badger.ErrBannedKey {
826828
glog.Errorf("Error while writing to banned namespace.")
@@ -832,6 +834,7 @@ func (n *node) commitOrAbort(pkey uint64, delta *pb.OracleDelta) error {
832834
if err != nil {
833835
glog.Errorf("Error while applying txn status to disk (%d -> %d): %v",
834836
start, commit, err)
837+
panic(err)
835838
}
836839
}
837840

worker/server_state.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ const (
4848
`client_key=; sasl-mechanism=PLAIN; tls=false;`
4949
LimitDefaults = `mutations=allow; query-edge=1000000; normalize-node=10000; ` +
5050
`mutations-nquad=1000000; disallow-drop=false; query-timeout=0ms; txn-abort-after=5m; ` +
51-
` max-retries=-1;max-pending-queries=10000`
51+
` max-retries=10;max-pending-queries=10000`
5252
ZeroLimitsDefaults = `uid-lease=0; refill-interval=30s; disable-admin-http=false;`
5353
GraphQLDefaults = `introspection=true; debug=false; extensions=true; poll-interval=1s; ` +
5454
`lambda-url=;`

x/x.go

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -613,11 +613,27 @@ func Max(a, b uint64) uint64 {
613613
return b
614614
}
615615

616+
// ExponentialRetry runs the given function until it succeeds or can no longer be retried.
617+
func ExponentialRetry(maxRetries int, waitAfterFailure time.Duration,
618+
f func() error) error {
619+
var err error
620+
for retry := maxRetries; retry > 0; retry-- {
621+
if err = f(); err == nil {
622+
return nil
623+
}
624+
if waitAfterFailure > 0 {
625+
time.Sleep(waitAfterFailure)
626+
waitAfterFailure *= 2
627+
}
628+
}
629+
return err
630+
}
631+
616632
// RetryUntilSuccess runs the given function until it succeeds or can no longer be retried.
617633
func RetryUntilSuccess(maxRetries int, waitAfterFailure time.Duration,
618634
f func() error) error {
619635
var err error
620-
for retry := maxRetries; retry != 0; retry-- {
636+
for retry := maxRetries; retry > 0; retry-- {
621637
if err = f(); err == nil {
622638
return nil
623639
}

0 commit comments

Comments
 (0)