Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BFT] Epoch Recovery integration test #6823

Open
wants to merge 6 commits into
base: feature/efm-recovery
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cmd/bootstrap/dkg/dkg.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ func RandomBeaconKG(n int, seed []byte) (model.ThresholdKeySet, error) {
return dkgData, nil
}

skShares, pkShares, pkGroup, err := crypto.BLSThresholdKeyGen(int(n),
signature.RandomBeaconThreshold(int(n)), seed)
skShares, pkShares, pkGroup, err := crypto.BLSThresholdKeyGen(n,
signature.RandomBeaconThreshold(n), seed)
if err != nil {
return model.ThresholdKeySet{}, fmt.Errorf("Beacon KeyGen failed: %w", err)
}
Expand Down
65 changes: 50 additions & 15 deletions cmd/bootstrap/run/epochs.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ import (
"encoding/hex"
"fmt"

"github.com/rs/zerolog"

"github.com/onflow/cadence"
"github.com/onflow/crypto"
"github.com/rs/zerolog"

"github.com/onflow/flow-go/cmd/util/cmd/common"
"github.com/onflow/flow-go/fvm/systemcontracts"
Expand All @@ -31,6 +31,51 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger,
recoveryEpochTargetDuration uint64,
unsafeAllowOverWrite bool,
snapshot *inmem.Snapshot,
) ([]cadence.Value, error) {
log.Info().Msg("collecting internal node network and staking keys")
internalNodes, err := common.ReadFullInternalNodeInfos(log, internalNodePrivInfoDir, nodeConfigJson)
if err != nil {
return nil, fmt.Errorf("failed to read full internal node infos: %w", err)
}

epochProtocolState, err := snapshot.EpochProtocolState()
if err != nil {
return nil, fmt.Errorf("failed to get epoch protocol state from snapshot: %w", err)
}
currentEpochCommit := epochProtocolState.EpochCommit()

return GenerateRecoverTxArgsWithDKG(
log,
internalNodes,
collectionClusters,
recoveryEpochCounter,
rootChainID,
numViewsInStakingAuction,
numViewsInEpoch,
recoveryEpochTargetDuration,
unsafeAllowOverWrite,
currentEpochCommit.DKGIndexMap,
currentEpochCommit.DKGParticipantKeys,
currentEpochCommit.DKGGroupKey,
snapshot,
)
}

// GenerateRecoverTxArgsWithDKG generates the required transaction arguments for the `recoverEpoch` transaction.
// No errors are expected during normal operation.
func GenerateRecoverTxArgsWithDKG(log zerolog.Logger,
internalNodes []bootstrap.NodeInfo,
collectionClusters int,
recoveryEpochCounter uint64,
rootChainID flow.ChainID,
numViewsInStakingAuction uint64,
numViewsInEpoch uint64,
recoveryEpochTargetDuration uint64,
unsafeAllowOverWrite bool,
dkgIndexMap flow.DKGIndexMap,
dkgParticipantKeys []crypto.PublicKey,
dkgGroupKey crypto.PublicKey,
snapshot *inmem.Snapshot,
) ([]cadence.Value, error) {
epoch := snapshot.Epochs().Current()
currentEpochCounter, err := epoch.Counter()
Expand Down Expand Up @@ -81,10 +126,6 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger,
partnerCollectors := make(flow.IdentityList, 0)

log.Info().Msg("collecting internal node network and staking keys")
internalNodes, err := common.ReadFullInternalNodeInfos(log, internalNodePrivInfoDir, nodeConfigJson)
if err != nil {
return nil, fmt.Errorf("failed to read full internal node infos: %w", err)
}

internalNodesMap := make(map[flow.Identifier]struct{})
for _, node := range internalNodes {
Expand Down Expand Up @@ -134,31 +175,25 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger,
// The EFM Recovery State Machine will heuristically reject recovery attempts (specifically reject EpochRecover Service
// events, when the intersection between consensus and random beacon committees is too small.

epochProtocolState, err := snapshot.EpochProtocolState()
if err != nil {
return nil, fmt.Errorf("failed to get epoch protocol state from snapshot: %w", err)
}
currentEpochCommit := epochProtocolState.EpochCommit()

// NOTE: The RecoveryEpoch will re-use the last successful DKG output. This means that the random beacon committee can be
// different from the consensus committee. This could happen if the node was ejected from the consensus committee, but it still has to be
// included in the DKG committee since the threshold signature scheme operates on pre-defined number of participants and cannot be changed.
dkgGroupKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(currentEpochCommit.DKGGroupKey.Encode()))
dkgGroupKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgGroupKey.Encode()))
if cdcErr != nil {
return nil, fmt.Errorf("failed to convert Random Beacon group key to cadence representation: %w", cdcErr)
}

// copy DKG index map from the current epoch
dkgIndexMapPairs := make([]cadence.KeyValuePair, 0)
for nodeID, index := range currentEpochCommit.DKGIndexMap {
for nodeID, index := range dkgIndexMap {
dkgIndexMapPairs = append(dkgIndexMapPairs, cadence.KeyValuePair{
Key: cadence.String(nodeID.String()),
Value: cadence.NewInt(index),
})
}
// copy DKG public keys from the current epoch
dkgPubKeys := make([]cadence.Value, 0)
for k, dkgPubKey := range currentEpochCommit.DKGParticipantKeys {
for k, dkgPubKey := range dkgParticipantKeys {
dkgPubKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgPubKey.Encode()))
if cdcErr != nil {
return nil, fmt.Errorf("failed convert public beacon key of participant %d to cadence representation: %w", k, cdcErr)
Expand Down
126 changes: 122 additions & 4 deletions integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package cohort2

import (
"fmt"

"strings"
"testing"
"time"
Expand All @@ -15,11 +14,13 @@ import (
sdk "github.com/onflow/flow-go-sdk"

"github.com/onflow/flow-go/cmd/bootstrap/run"
"github.com/onflow/flow-go/cmd/util/cmd/common"
"github.com/onflow/flow-go/integration/tests/epochs"
"github.com/onflow/flow-go/integration/utils"
"github.com/onflow/flow-go/model/bootstrap"
"github.com/onflow/flow-go/model/flow"
"github.com/onflow/flow-go/model/flow/filter"
"github.com/onflow/flow-go/utils/unittest"
)

func TestRecoverEpoch(t *testing.T) {
Expand All @@ -39,9 +40,9 @@ func (s *RecoverEpochSuite) SetupTest() {
s.EpochLen = 150
s.FinalizationSafetyThreshold = 20
s.NumOfCollectionClusters = 1
// we need to use 3 consensus nodes to be able to eject a single node from the consensus committee
// and still have a Random Beacon committee which meets the protocol.RandomBeaconSafetyThreshold
s.NumOfConsensusNodes = 3
// we need to use 4 consensus nodes to be able to eject a single node and still have a super-majority and
// have a Random Beacon committee which meets the protocol.RandomBeaconSafetyThreshold.
s.NumOfConsensusNodes = 4

// run the generic setup, which starts up the network
s.BaseSuite.SetupTest()
Expand Down Expand Up @@ -290,3 +291,120 @@ func (s *RecoverEpochSuite) TestRecoverEpochNodeEjected() {

s.AssertInEpoch(s.Ctx, 1)
}

// TestRecoverEpochEjectNodeDifferentDKG ensures that the recover epoch governance transaction flow works as expected, and a network that
// enters Epoch Fallback Mode can successfully recover.
// For this specific scenario, we are testing a scenario where the consensus committee and Random Beacon committee form a symmetric difference with
// cardinality 1. In other words, there is a node which is part of the consensus committee but not part of the Random Beacon committee and
// another node which is part of the Random Beacon committee but not part of the consensus committee.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// another node which is part of the Random Beacon committee but not part of the consensus committee.
// another node which is part of the Random Beacon committee but not part of the consensus committee.
// We remove the first consensus node from the Consensus Committee, and the last consensus node from the Random Beacon Committee:
// If the original consensus set is {A, B, C, D} then:
// - the post-recovery consensus committee is {B, C, D}
// - the post-recovery random beacon committee is {A, B, C}
//

// This test will do the following:
// 1. Triggers EFM by turning off the sole collection node before the end of the DKG forcing the DKG to fail.
// 2. Generates epoch recover transaction args using the epoch efm-recover-tx-args.
// 3. Eject consensus node by modifying the snapshot before generating the recover epoch transaction args.
// 4. Eject consensus node from the Random Beacon committee by modifying the snapshot before generating the recover epoch transaction args.
// 5. Submit recover epoch transaction.
// 6. Ensure expected EpochRecover event is emitted.
// 7. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch.
func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() {
// 1. Manually trigger EFM

// pause the collection node to trigger EFM by failing DKG
ln := s.GetContainersByRole(flow.RoleCollection)[0]
require.NoError(s.T(), ln.Pause())
s.AwaitFinalizedView(s.Ctx, s.GetDKGEndView(), 2*time.Minute, 500*time.Millisecond)
// start the paused collection node now that we are in EFM
require.NoError(s.T(), ln.Start())

// get final view from the latest snapshot
epoch1FinalView, err := s.Net.BootstrapSnapshot.Epochs().Current().FinalView()
require.NoError(s.T(), err)

// Wait for at least the first view past the current epoch's original FinalView to be finalized.
s.TimedLogf("waiting for epoch transition (finalized view %d)", epoch1FinalView+1)
s.AwaitFinalizedView(s.Ctx, epoch1FinalView+1, 2*time.Minute, 500*time.Millisecond)
s.TimedLogf("observed finalized view %d", epoch1FinalView+1)

// assert that we are in EFM
snapshot, err := s.Client.GetLatestProtocolSnapshot(s.Ctx)
require.NoError(s.T(), err)
epochPhase, err := snapshot.EpochPhase()
require.NoError(s.T(), err)
require.Equal(s.T(), flow.EpochPhaseFallback, epochPhase, "network must enter EFM by this point")

// 2. Generate transaction arguments for epoch recover transaction.
collectionClusters := s.NumOfCollectionClusters
recoveryEpochCounter := uint64(1)

// read internal node info from one of the consensus nodes
internalNodePrivInfoDir, nodeConfigJson := s.getNodeInfoDirs(flow.RoleConsensus)
internalNodes, err := common.ReadFullInternalNodeInfos(unittest.Logger(), internalNodePrivInfoDir, nodeConfigJson)
require.NoError(s.T(), err)
// 3. Eject consensus node by modifying the snapshot before generating the recover epoch transaction args.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// 3. Eject consensus node by modifying the snapshot before generating the recover epoch transaction args.
// 3. Eject the FIRST consensus node by modifying the snapshot before generating the recover epoch transaction args.

// By ejecting a node from the consensus committee but keeping it in the Random Beacon committee, we ensure that the there is a node
// which is not part of the consensus committee but is part of the Random Beacon committee.
currentIdentityTable := snapshot.Encodable().SealingSegment.LatestProtocolStateEntry().EpochEntry.CurrentEpochIdentityTable
ejectedIdentity := currentIdentityTable.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus))[0]
ejectedIdentity.EpochParticipationStatus = flow.EpochParticipationStatusEjected

// 4. Modify DKG data by removing the last node of the consensus committee from DKG committee. We can do this
// by altering the DKG index map and DKG public key shares.
// This way we ensure that consensus committee has a node which is not part of the Random Beacon committee.
randomBeaconParticipants := currentIdentityTable.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus))
nConsensusNodes := len(randomBeaconParticipants) - 1

dkgIndexMap := make(flow.DKGIndexMap, nConsensusNodes)
for i, participant := range randomBeaconParticipants[:nConsensusNodes] {
dkgIndexMap[participant.NodeID] = i
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
}
}
dkgKeyShares := dkg.KeyShares()[:nConsensusNodes]

Putting this next to the corresponding index map construction logic


epochProtocolState, err := snapshot.EpochProtocolState()
require.NoError(s.T(), err)
dkg, err := epochProtocolState.DKG()
require.NoError(s.T(), err)

// At this point we have a node which is part of the consensus committee but not part of the Random Beacon committee and
// another node which is part of the Random Beacon committee but not part of the consensus committee.
txArgs, err := run.GenerateRecoverTxArgsWithDKG(
s.Log,
internalNodes,
collectionClusters,
recoveryEpochCounter,
flow.Localnet,
s.StakingAuctionLen,
s.EpochLen,
3000,
false,
dkgIndexMap,
dkg.KeyShares()[:nConsensusNodes],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
dkg.KeyShares()[:nConsensusNodes],
dkgKeyShares,

dkg.GroupKey(),
snapshot,
)
require.NoError(s.T(), err)

// 5. Submit recover epoch transaction to the network.
env := utils.LocalnetEnv()
result := s.recoverEpoch(env, txArgs)
require.NoError(s.T(), result.Error)
require.Equal(s.T(), result.Status, sdk.TransactionStatusSealed)

// 6. Ensure expected EpochRecover event is emitted.
eventType := ""
for _, evt := range result.Events {
if strings.Contains(evt.Type, "FlowEpoch.EpochRecover") {
eventType = evt.Type
break
}
}
require.NotEmpty(s.T(), eventType, "expected FlowEpoch.EpochRecover event type")
events, err := s.Client.GetEventsForBlockIDs(s.Ctx, eventType, []sdk.Identifier{result.BlockID})
require.NoError(s.T(), err)
require.Equal(s.T(), events[0].Events[0].Type, eventType)

// 7. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch.
startViewOfNextEpoch := uint64(txArgs[1].(cadence.UInt64))
s.TimedLogf("waiting to transition into recovery epoch (finalized view %d)", startViewOfNextEpoch)
s.AwaitFinalizedView(s.Ctx, startViewOfNextEpoch, 2*time.Minute, 500*time.Millisecond)
s.TimedLogf("observed finalized first view of recovery epoch %d", startViewOfNextEpoch)

s.AssertInEpoch(s.Ctx, 1)
}
Loading