forked from vitessio/vitess
-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fail VReplication workflows on errors that persist and unrecoverable …
…errors (vitessio#10429) (vitessio#783) * Fail workflow if same error persists too long. Fail for unrecoverable errors also in non-online ddl workflows Signed-off-by: Rohit Nayak <rohit@planetscale.com> * Update max time default to 15m, was 1m for testing purposes Signed-off-by: Rohit Nayak <rohit@planetscale.com> * Leverage vterrors for Equals; attempt to address my own nits Signed-off-by: Matt Lord <mattalord@gmail.com> * sanity: validate range of vreplication_retry_delay and of vreplication_max_time_to_retry_on_error Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * Fix flags test Signed-off-by: Rohit Nayak <rohit@planetscale.com> * Remove leftover log.Flush() Signed-off-by: Rohit Nayak <rohit@planetscale.com> * Revert validations min/max settings on retry delay since it is breaking unit tests that set the value to a very small value Signed-off-by: Rohit Nayak <rohit@planetscale.com> * capitalize per request Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Co-authored-by: Matt Lord <mattalord@gmail.com> Co-authored-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Co-authored-by: Rohit Nayak <57520317+rohit-nayak-ps@users.noreply.github.com> Co-authored-by: Matt Lord <mattalord@gmail.com>
- Loading branch information
1 parent
53e1b7a
commit 71c61dc
Showing
6 changed files
with
159 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
/* | ||
Copyright 2022 The Vitess Authors. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package vreplication | ||
|
||
import ( | ||
"sync" | ||
"time" | ||
|
||
"vitess.io/vitess/go/vt/log" | ||
"vitess.io/vitess/go/vt/vterrors" | ||
) | ||
|
||
// lastError tracks the most recent error for any ongoing process and how long
// it has persisted. The err field should be a vterror so that meaningful error
// codes, causes, stack traces, etc. are available.
type lastError struct {
	// mu is held by record and shouldRetry for the duration of each call.
	mu sync.Mutex

	// name is the label passed to newLastError.
	name string

	// maxTimeInError is the cut-off: once the tracked error has persisted for
	// longer than this, shouldRetry() returns false.
	maxTimeInError time.Duration

	// err is the error currently being tracked; record(nil) resets it to nil.
	err error

	// firstSeen is when err was first recorded; record(nil) resets it to the
	// zero time.
	firstSeen time.Time
}
|
||
func newLastError(name string, maxTimeInError time.Duration) *lastError { | ||
return &lastError{ | ||
name: name, | ||
maxTimeInError: maxTimeInError, | ||
} | ||
} | ||
|
||
func (le *lastError) record(err error) { | ||
le.mu.Lock() | ||
defer le.mu.Unlock() | ||
if err == nil { | ||
le.err = nil | ||
le.firstSeen = time.Time{} | ||
return | ||
} | ||
if !vterrors.Equals(err, le.err) { | ||
le.firstSeen = time.Now() | ||
le.err = err | ||
} | ||
// The error is unchanged so we don't need to do anything | ||
} | ||
|
||
func (le *lastError) shouldRetry() bool { | ||
le.mu.Lock() | ||
defer le.mu.Unlock() | ||
if !le.firstSeen.IsZero() && time.Since(le.firstSeen) > le.maxTimeInError { | ||
log.Errorf("VReplication encountered the same error continuously since %s, we will assume this is a non-recoverable error and will not retry anymore; the workflow will need to be manually restarted once error '%s' has been addressed", | ||
le.firstSeen.UTC(), le.err) | ||
return false | ||
} | ||
return true | ||
} |
55 changes: 55 additions & 0 deletions
55
go/vt/vttablet/tabletmanager/vreplication/last_error_test.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
/* | ||
Copyright 2022 The Vitess Authors. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package vreplication | ||
|
||
import ( | ||
"fmt" | ||
"testing" | ||
"time" | ||
|
||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
func TestLastError(t *testing.T) { | ||
le := newLastError("test", 100*time.Millisecond) | ||
|
||
t.Run("long running error", func(t *testing.T) { | ||
err1 := fmt.Errorf("test1") | ||
le.record(err1) | ||
require.True(t, le.shouldRetry()) | ||
time.Sleep(150 * time.Millisecond) | ||
require.False(t, le.shouldRetry()) | ||
}) | ||
|
||
t.Run("new long running error", func(t *testing.T) { | ||
err2 := fmt.Errorf("test2") | ||
le.record(err2) | ||
require.True(t, le.shouldRetry()) | ||
for i := 1; i < 10; i++ { | ||
le.record(err2) | ||
} | ||
require.True(t, le.shouldRetry()) | ||
time.Sleep(150 * time.Millisecond) | ||
le.record(err2) | ||
require.False(t, le.shouldRetry()) | ||
}) | ||
|
||
t.Run("no error", func(t *testing.T) { | ||
le.record(nil) | ||
require.True(t, le.shouldRetry()) | ||
}) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters