Skip to content

Commit

Permalink
Verification package: Verified given data-dir.
Browse files Browse the repository at this point in the history
For now it verifies whether Backend.cindex is consistent with the WAL log,
but should get expanded to cover memberships & revisions.
  • Loading branch information
ptabor committed Apr 27, 2021
1 parent ad85d87 commit af61bb2
Show file tree
Hide file tree
Showing 4 changed files with 180 additions and 14 deletions.
20 changes: 20 additions & 0 deletions server/verify/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Copyright 2021 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package verify

// The verify package analyzes the persistent state of etcd to find potential
// inconsistencies.
// In particular, it covers cross-checking between different aspects of etcd
// storage, such as the WAL and the Backend.
138 changes: 138 additions & 0 deletions server/verify/verify.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
// Copyright 2021 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package verify

import (
"fmt"
"os"

"go.etcd.io/etcd/raft/v3/raftpb"
"go.etcd.io/etcd/server/v3/datadir"
"go.etcd.io/etcd/server/v3/etcdserver/cindex"
"go.etcd.io/etcd/server/v3/mvcc/backend"
wal2 "go.etcd.io/etcd/server/v3/wal"
"go.etcd.io/etcd/server/v3/wal/walpb"
"go.uber.org/zap"
)

const (
	// ENV_VERIFY is the name of the environment variable consulted by
	// VerifyIfEnabled to decide whether verification should run.
	ENV_VERIFY = "ETCD_VERIFY"

	// ENV_VERIFY_ALL_VALUE is the value of ENV_VERIFY that enables verification.
	ENV_VERIFY_ALL_VALUE = "all"
)

// Config carries the inputs of a verification run: which data directory to
// inspect, how strict the index comparison is, and where to log.
type Config struct {
// DataDir is a root directory where the data being verified are stored.
DataDir string

// ExactIndex requires consistent_index in backend exactly match the last committed WAL entry.
// Usually backend's consistent_index needs to be <= WAL.commit, but for backups the match
// is expected to be exact.
ExactIndex bool

// Logger receives progress and failure messages of the verification.
// Verify falls back to a no-op logger for its own messages when this is nil.
Logger *zap.Logger
}

// Verify performs consistency checks of the given etcd data-directory.
//
// It opens the backend under cfg.DataDir, validates the WAL, and then checks
// that the backend's consistent index agrees with the commit index of the
// HardState read from the WAL (see Config.ExactIndex for the strictness).
//
// Problems are reported through the returned error, but in some situations
// the function can also panic; a panic is logged and then re-raised.
// The function is expected to work on a not-in-use data model, i.e.
// no file-locks should be taken. Verify does not modify the data.
//
// A nil cfg.Logger is replaced by a no-op logger.
func Verify(cfg Config) (retErr error) {
	lg := cfg.Logger
	if lg == nil {
		lg = zap.NewNop()
	}
	// cfg is a copy, so this only affects the helpers called below; it
	// guarantees they never dereference a nil logger.
	cfg.Logger = lg

	lg.Info("verification of persisted state", zap.String("data-dir", cfg.DataDir))
	// The result is a named return value so that this deferred reporter sees
	// the error of every return statement, including the final one.
	defer func() {
		if r := recover(); r != nil {
			lg.Error("verification of persisted state failed",
				zap.String("data-dir", cfg.DataDir))
			panic(r)
		}
		if retErr != nil {
			lg.Error("verification of persisted state failed",
				zap.String("data-dir", cfg.DataDir),
				zap.Error(retErr))
			return
		}
		lg.Info("verification of persisted state successful", zap.String("data-dir", cfg.DataDir))
	}()

	beConfig := backend.DefaultBackendConfig()
	beConfig.Path = datadir.ToBackendFileName(cfg.DataDir)
	beConfig.Logger = lg

	be := backend.New(beConfig)
	defer be.Close()

	_, hardstate, err := validateWal(cfg)
	if err != nil {
		return err
	}

	// TODO: Perform validation of consistency of membership between
	// backend/members & WAL confstate (and maybe storev2 if still exists).

	return validateConsistentIndex(cfg, hardstate, be)
}

// VerifyIfEnabled runs Verify only when the ETCD_VERIFY environment variable
// (ENV_VERIFY) is set to ENV_VERIFY_ALL_VALUE; otherwise it is a no-op that
// returns nil.
// See Verify for more information.
func VerifyIfEnabled(cfg Config) error {
	if os.Getenv(ENV_VERIFY) != ENV_VERIFY_ALL_VALUE {
		// Verification was not requested through the environment.
		return nil
	}
	return Verify(cfg)
}

// MustVerifyIfEnabled performs verification according to ETCD_VERIFY env settings
// and panics (through the logger's Panic level) if a problem is found.
// A nil cfg.Logger is tolerated: a no-op logger is used to raise the panic.
// See Verify for more information.
func MustVerifyIfEnabled(cfg Config) {
	err := VerifyIfEnabled(cfg)
	if err == nil {
		return
	}

	lg := cfg.Logger
	if lg == nil {
		// Calling Panic on a nil *zap.Logger would fail with a nil pointer
		// dereference instead of reporting the verification error.
		lg = zap.NewNop()
	}
	lg.Panic("Verification failed",
		zap.String("data-dir", cfg.DataDir),
		zap.Error(err))
}

// validateConsistentIndex compares the consistent index stored in the backend
// with the commit index of the given WAL HardState.
//
// With cfg.ExactIndex set the two values must be equal; otherwise the backend
// index only has to be <= hardstate.Commit. A violation is returned as an
// error; on success the compared values are logged and nil is returned.
// A nil cfg.Logger is replaced by a no-op logger.
func validateConsistentIndex(cfg Config, hardstate *raftpb.HardState, be backend.Backend) error {
	lg := cfg.Logger
	if lg == nil {
		// Keep the success path from dereferencing a nil logger.
		lg = zap.NewNop()
	}

	index := cindex.NewConsistentIndex(be.BatchTx()).ConsistentIndex()
	commit := hardstate.Commit

	if cfg.ExactIndex && index != commit {
		return fmt.Errorf("backend.ConsistentIndex (%v) expected == WAL.HardState.commit (%v)", index, commit)
	}
	if index > commit {
		return fmt.Errorf("backend.ConsistentIndex (%v) must be <= WAL.HardState.commit (%v)", index, commit)
	}

	lg.Info("verification: consistentIndex OK", zap.Uint64("backend-consistent-index", index), zap.Uint64("hardstate-commit", commit))
	return nil
}

// validateWal verifies the WAL stored under cfg.DataDir.
//
// It collects the valid snapshot entries of the WAL, picks the last one and
// runs the WAL verification starting from it. On success it returns that
// snapshot entry together with the HardState reported by the WAL verification.
// An error is returned if the snapshot entries cannot be read, if none is
// found, or if the WAL verification itself fails.
func validateWal(cfg Config) (*walpb.Snapshot, *raftpb.HardState, error) {
	walDir := datadir.ToWalDir(cfg.DataDir)

	walSnaps, err := wal2.ValidSnapshotEntries(cfg.Logger, walDir)
	if err != nil {
		return nil, nil, err
	}
	if len(walSnaps) == 0 {
		// Without this guard the index expression below would panic with
		// "index out of range" instead of reporting a verification error.
		return nil, nil, fmt.Errorf("no valid snapshot entries found in WAL directory %q", walDir)
	}

	snapshot := walSnaps[len(walSnaps)-1]
	hardstate, err := wal2.Verify(cfg.Logger, walDir, snapshot)
	if err != nil {
		return nil, nil, err
	}
	return &snapshot, hardstate, nil
}
22 changes: 12 additions & 10 deletions server/wal/wal.go
Original file line number Diff line number Diff line change
Expand Up @@ -618,10 +618,11 @@ func ValidSnapshotEntries(lg *zap.Logger, walDir string) ([]walpb.Snapshot, erro
// If it cannot read out the expected snap, it will return ErrSnapshotNotFound.
// If the loaded snap doesn't match with the expected one, it will
// return error ErrSnapshotMismatch.
func Verify(lg *zap.Logger, walDir string, snap walpb.Snapshot) error {
func Verify(lg *zap.Logger, walDir string, snap walpb.Snapshot) (*raftpb.HardState, error) {
var metadata []byte
var err error
var match bool
var state raftpb.HardState

rec := &walpb.Record{}

Expand All @@ -630,14 +631,14 @@ func Verify(lg *zap.Logger, walDir string, snap walpb.Snapshot) error {
}
names, nameIndex, err := selectWALFiles(lg, walDir, snap)
if err != nil {
return err
return nil, err
}

// open wal files in read mode, so that there is no conflict
// when the same WAL is opened elsewhere in write mode
rs, _, closer, err := openWALFiles(lg, walDir, names, nameIndex, false)
if err != nil {
return err
return nil, err
}
defer func() {
if closer != nil {
Expand All @@ -652,46 +653,47 @@ func Verify(lg *zap.Logger, walDir string, snap walpb.Snapshot) error {
switch rec.Type {
case metadataType:
if metadata != nil && !bytes.Equal(metadata, rec.Data) {
return ErrMetadataConflict
return nil, ErrMetadataConflict
}
metadata = rec.Data
case crcType:
crc := decoder.crc.Sum32()
// Current crc of decoder must match the crc of the record.
// We need not match 0 crc, since the decoder is a new one at this point.
if crc != 0 && rec.Validate(crc) != nil {
return ErrCRCMismatch
return nil, ErrCRCMismatch
}
decoder.updateCRC(rec.Crc)
case snapshotType:
var loadedSnap walpb.Snapshot
pbutil.MustUnmarshal(&loadedSnap, rec.Data)
if loadedSnap.Index == snap.Index {
if loadedSnap.Term != snap.Term {
return ErrSnapshotMismatch
return nil, ErrSnapshotMismatch
}
match = true
}
// We ignore all entry and state type records as these
// are not necessary for validating the WAL contents
case entryType:
case stateType:
pbutil.MustUnmarshal(&state, rec.Data)
default:
return fmt.Errorf("unexpected block type %d", rec.Type)
return nil, fmt.Errorf("unexpected block type %d", rec.Type)
}
}

// We do not have to read out all the WAL entries
// as the decoder is opened in read mode.
if err != io.EOF && err != io.ErrUnexpectedEOF {
return err
return nil, err
}

if !match {
return ErrSnapshotNotFound
return nil, ErrSnapshotNotFound
}

return nil
return &state, nil
}

// cut closes current file written and creates a new one ready to append.
Expand Down
14 changes: 10 additions & 4 deletions server/wal/wal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@ import (
"regexp"
"testing"

"github.com/stretchr/testify/assert"
"go.etcd.io/etcd/client/pkg/v3/fileutil"
"go.etcd.io/etcd/pkg/v3/pbutil"
"go.etcd.io/etcd/raft/v3/raftpb"
"go.etcd.io/etcd/server/v3/wal/walpb"
"go.uber.org/zap/zaptest"

"go.uber.org/zap"
)
Expand Down Expand Up @@ -231,14 +233,14 @@ func TestOpenAtIndex(t *testing.T) {
// The test creates a WAL directory and cuts out multiple WAL files. Then
// it corrupts one of the files by completely truncating it.
func TestVerify(t *testing.T) {
lg := zaptest.NewLogger(t)
walDir, err := ioutil.TempDir(t.TempDir(), "waltest")
if err != nil {
t.Fatal(err)
}
defer os.RemoveAll(walDir)

// create WAL
w, err := Create(zap.NewExample(), walDir, nil)
w, err := Create(lg, walDir, nil)
if err != nil {
t.Fatal(err)
}
Expand All @@ -255,11 +257,15 @@ func TestVerify(t *testing.T) {
}
}

hs := raftpb.HardState{Term: 1, Vote: 3, Commit: 5}
assert.NoError(t, w.Save(hs, nil))

// to verify the WAL is not corrupted at this point
err = Verify(zap.NewExample(), walDir, walpb.Snapshot{})
hardstate, err := Verify(lg, walDir, walpb.Snapshot{})
if err != nil {
t.Errorf("expected a nil error, got %v", err)
}
assert.Equal(t, hs, *hardstate)

walFiles, err := ioutil.ReadDir(walDir)
if err != nil {
Expand All @@ -272,7 +278,7 @@ func TestVerify(t *testing.T) {
t.Fatal(err)
}

err = Verify(zap.NewExample(), walDir, walpb.Snapshot{})
_, err = Verify(lg, walDir, walpb.Snapshot{})
if err == nil {
t.Error("expected a non-nil error, got nil")
}
Expand Down

0 comments on commit af61bb2

Please sign in to comment.