diff --git a/cmd/bootstrap/dkg/dkg.go b/cmd/bootstrap/dkg/dkg.go index e6dfe1788d4..79bacac26d9 100644 --- a/cmd/bootstrap/dkg/dkg.go +++ b/cmd/bootstrap/dkg/dkg.go @@ -26,8 +26,8 @@ func RandomBeaconKG(n int, seed []byte) (model.ThresholdKeySet, error) { return dkgData, nil } - skShares, pkShares, pkGroup, err := crypto.BLSThresholdKeyGen(int(n), - signature.RandomBeaconThreshold(int(n)), seed) + skShares, pkShares, pkGroup, err := crypto.BLSThresholdKeyGen(n, + signature.RandomBeaconThreshold(n), seed) if err != nil { return model.ThresholdKeySet{}, fmt.Errorf("Beacon KeyGen failed: %w", err) } diff --git a/cmd/bootstrap/run/epochs.go b/cmd/bootstrap/run/epochs.go index e242e4cf41b..f3ead512e4f 100644 --- a/cmd/bootstrap/run/epochs.go +++ b/cmd/bootstrap/run/epochs.go @@ -4,9 +4,9 @@ import ( "encoding/hex" "fmt" - "github.com/rs/zerolog" - "github.com/onflow/cadence" + "github.com/onflow/crypto" + "github.com/rs/zerolog" "github.com/onflow/flow-go/cmd/util/cmd/common" "github.com/onflow/flow-go/fvm/systemcontracts" @@ -31,6 +31,51 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger, recoveryEpochTargetDuration uint64, unsafeAllowOverWrite bool, snapshot *inmem.Snapshot, +) ([]cadence.Value, error) { + log.Info().Msg("collecting internal node network and staking keys") + internalNodes, err := common.ReadFullInternalNodeInfos(log, internalNodePrivInfoDir, nodeConfigJson) + if err != nil { + return nil, fmt.Errorf("failed to read full internal node infos: %w", err) + } + + epochProtocolState, err := snapshot.EpochProtocolState() + if err != nil { + return nil, fmt.Errorf("failed to get epoch protocol state from snapshot: %w", err) + } + currentEpochCommit := epochProtocolState.EpochCommit() + + return GenerateRecoverTxArgsWithDKG( + log, + internalNodes, + collectionClusters, + recoveryEpochCounter, + rootChainID, + numViewsInStakingAuction, + numViewsInEpoch, + recoveryEpochTargetDuration, + unsafeAllowOverWrite, + currentEpochCommit.DKGIndexMap, + 
currentEpochCommit.DKGParticipantKeys, + currentEpochCommit.DKGGroupKey, + snapshot, + ) +} + +// GenerateRecoverTxArgsWithDKG generates the required transaction arguments for the `recoverEpoch` transaction. +// No errors are expected during normal operation. +func GenerateRecoverTxArgsWithDKG(log zerolog.Logger, + internalNodes []bootstrap.NodeInfo, + collectionClusters int, + recoveryEpochCounter uint64, + rootChainID flow.ChainID, + numViewsInStakingAuction uint64, + numViewsInEpoch uint64, + recoveryEpochTargetDuration uint64, + unsafeAllowOverWrite bool, + dkgIndexMap flow.DKGIndexMap, + dkgParticipantKeys []crypto.PublicKey, + dkgGroupKey crypto.PublicKey, + snapshot *inmem.Snapshot, ) ([]cadence.Value, error) { epoch := snapshot.Epochs().Current() currentEpochCounter, err := epoch.Counter() @@ -81,10 +126,6 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger, partnerCollectors := make(flow.IdentityList, 0) log.Info().Msg("collecting internal node network and staking keys") - internalNodes, err := common.ReadFullInternalNodeInfos(log, internalNodePrivInfoDir, nodeConfigJson) - if err != nil { - return nil, fmt.Errorf("failed to read full internal node infos: %w", err) - } internalNodesMap := make(map[flow.Identifier]struct{}) for _, node := range internalNodes { @@ -134,23 +175,17 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger, // The EFM Recovery State Machine will heuristically reject recovery attempts (specifically reject EpochRecover Service // events, when the intersection between consensus and random beacon committees is too small. - epochProtocolState, err := snapshot.EpochProtocolState() - if err != nil { - return nil, fmt.Errorf("failed to get epoch protocol state from snapshot: %w", err) - } - currentEpochCommit := epochProtocolState.EpochCommit() - // NOTE: The RecoveryEpoch will re-use the last successful DKG output. This means that the random beacon committee can be // different from the consensus committee. 
This could happen if the node was ejected from the consensus committee, but it still has to be + // included in the DKG committee since the threshold signature scheme operates on pre-defined number of participants and cannot be changed. - dkgGroupKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(currentEpochCommit.DKGGroupKey.Encode())) + dkgGroupKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgGroupKey.Encode())) if cdcErr != nil { return nil, fmt.Errorf("failed to convert Random Beacon group key to cadence representation: %w", cdcErr) } // copy DKG index map from the current epoch dkgIndexMapPairs := make([]cadence.KeyValuePair, 0) - for nodeID, index := range currentEpochCommit.DKGIndexMap { + for nodeID, index := range dkgIndexMap { dkgIndexMapPairs = append(dkgIndexMapPairs, cadence.KeyValuePair{ Key: cadence.String(nodeID.String()), Value: cadence.NewInt(index), @@ -158,7 +193,7 @@ func GenerateRecoverEpochTxArgs(log zerolog.Logger, } // copy DKG public keys from the current epoch dkgPubKeys := make([]cadence.Value, 0) - for k, dkgPubKey := range currentEpochCommit.DKGParticipantKeys { + for k, dkgPubKey := range dkgParticipantKeys { dkgPubKeyCdc, cdcErr := cadence.NewString(hex.EncodeToString(dkgPubKey.Encode())) if cdcErr != nil { return nil, fmt.Errorf("failed convert public beacon key of participant %d to cadence representation: %w", k, cdcErr) diff --git a/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go b/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go index 385c42c5c88..3167f67c9f1 100644 --- a/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go +++ b/integration/tests/epochs/cohort2/epoch_recover_from_efm_test.go @@ -2,7 +2,7 @@ package cohort2 import ( "fmt" "strings" "testing" "time" @@ -15,11 +14,13 @@ import ( sdk "github.com/onflow/flow-go-sdk" "github.com/onflow/flow-go/cmd/bootstrap/run" + "github.com/onflow/flow-go/cmd/util/cmd/common" "github.com/onflow/flow-go/integration/tests/epochs" 
"github.com/onflow/flow-go/integration/utils" "github.com/onflow/flow-go/model/bootstrap" "github.com/onflow/flow-go/model/flow" "github.com/onflow/flow-go/model/flow/filter" + "github.com/onflow/flow-go/utils/unittest" ) func TestRecoverEpoch(t *testing.T) { @@ -39,9 +40,9 @@ func (s *RecoverEpochSuite) SetupTest() { s.EpochLen = 150 s.FinalizationSafetyThreshold = 20 s.NumOfCollectionClusters = 1 - // we need to use 3 consensus nodes to be able to eject a single node from the consensus committee - // and still have a Random Beacon committee which meets the protocol.RandomBeaconSafetyThreshold - s.NumOfConsensusNodes = 3 + // we need to use 4 consensus nodes to be able to eject a single node and still have a super-majority and + // have a Random Beacon committee which meets the protocol.RandomBeaconSafetyThreshold. + s.NumOfConsensusNodes = 4 // run the generic setup, which starts up the network s.BaseSuite.SetupTest() @@ -290,3 +291,120 @@ func (s *RecoverEpochSuite) TestRecoverEpochNodeEjected() { s.AssertInEpoch(s.Ctx, 1) } + +// TestRecoverEpochEjectNodeDifferentDKG ensures that the recover epoch governance transaction flow works as expected, and a network that +// enters Epoch Fallback Mode can successfully recover. +// For this specific scenario, we are testing a scenario where the consensus committee and Random Beacon committee form a symmetric difference with +// cardinality 1. In other words, there is a node which is part of the consensus committee but not part of the Random Beacon committee and +// another node which is part of the Random Beacon committee but not part of the consensus committee. +// This test will do the following: +// 1. Triggers EFM by turning off the sole collection node before the end of the DKG forcing the DKG to fail. +// 2. Generates epoch recover transaction args using the epoch efm-recover-tx-args. +// 3. Eject consensus node by modifying the snapshot before generating the recover epoch transaction args. +// 4. 
Eject consensus node from the Random Beacon committee by modifying the snapshot before generating the recover epoch transaction args. +// 5. Submit recover epoch transaction. +// 6. Ensure expected EpochRecover event is emitted. +// 7. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch. +func (s *RecoverEpochSuite) TestRecoverEpochEjectNodeDifferentDKG() { + // 1. Manually trigger EFM + + // pause the collection node to trigger EFM by failing DKG + ln := s.GetContainersByRole(flow.RoleCollection)[0] + require.NoError(s.T(), ln.Pause()) + s.AwaitFinalizedView(s.Ctx, s.GetDKGEndView(), 2*time.Minute, 500*time.Millisecond) + // start the paused collection node now that we are in EFM + require.NoError(s.T(), ln.Start()) + + // get final view from the latest snapshot + epoch1FinalView, err := s.Net.BootstrapSnapshot.Epochs().Current().FinalView() + require.NoError(s.T(), err) + + // Wait for at least the first view past the current epoch's original FinalView to be finalized. + s.TimedLogf("waiting for epoch transition (finalized view %d)", epoch1FinalView+1) + s.AwaitFinalizedView(s.Ctx, epoch1FinalView+1, 2*time.Minute, 500*time.Millisecond) + s.TimedLogf("observed finalized view %d", epoch1FinalView+1) + + // assert that we are in EFM + snapshot, err := s.Client.GetLatestProtocolSnapshot(s.Ctx) + require.NoError(s.T(), err) + epochPhase, err := snapshot.EpochPhase() + require.NoError(s.T(), err) + require.Equal(s.T(), flow.EpochPhaseFallback, epochPhase, "network must enter EFM by this point") + + // 2. Generate transaction arguments for epoch recover transaction. 
+ collectionClusters := s.NumOfCollectionClusters + recoveryEpochCounter := uint64(1) + + // read internal node info from one of the consensus nodes + internalNodePrivInfoDir, nodeConfigJson := s.getNodeInfoDirs(flow.RoleConsensus) + internalNodes, err := common.ReadFullInternalNodeInfos(unittest.Logger(), internalNodePrivInfoDir, nodeConfigJson) + require.NoError(s.T(), err) + // 3. Eject consensus node by modifying the snapshot before generating the recover epoch transaction args. + // By ejecting a node from the consensus committee but keeping it in the Random Beacon committee, we ensure that the there is a node + // which is not part of the consensus committee but is part of the Random Beacon committee. + currentIdentityTable := snapshot.Encodable().SealingSegment.LatestProtocolStateEntry().EpochEntry.CurrentEpochIdentityTable + ejectedIdentity := currentIdentityTable.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus))[0] + ejectedIdentity.EpochParticipationStatus = flow.EpochParticipationStatusEjected + + // 4. Modify DKG data by removing the last node of the consensus committee from DKG committee. We can do this + // by altering the DKG index map and DKG public key shares. + // This way we ensure that consensus committee has a node which is not part of the Random Beacon committee. 
+ randomBeaconParticipants := currentIdentityTable.Filter(filter.HasRole[flow.Identity](flow.RoleConsensus)) + nConsensusNodes := len(randomBeaconParticipants) - 1 + + dkgIndexMap := make(flow.DKGIndexMap, nConsensusNodes) + for i, participant := range randomBeaconParticipants[:nConsensusNodes] { + dkgIndexMap[participant.NodeID] = i + } + + epochProtocolState, err := snapshot.EpochProtocolState() + require.NoError(s.T(), err) + dkg, err := epochProtocolState.DKG() + require.NoError(s.T(), err) + + // At this point we have a node which is part of the consensus committee but not part of the Random Beacon committee and + // another node which is part of the Random Beacon committee but not part of the consensus committee. + txArgs, err := run.GenerateRecoverTxArgsWithDKG( + s.Log, + internalNodes, + collectionClusters, + recoveryEpochCounter, + flow.Localnet, + s.StakingAuctionLen, + s.EpochLen, + 3000, + false, + dkgIndexMap, + dkg.KeyShares()[:nConsensusNodes], + dkg.GroupKey(), + snapshot, + ) + require.NoError(s.T(), err) + + // 5. Submit recover epoch transaction to the network. + env := utils.LocalnetEnv() + result := s.recoverEpoch(env, txArgs) + require.NoError(s.T(), result.Error) + require.Equal(s.T(), result.Status, sdk.TransactionStatusSealed) + + // 6. Ensure expected EpochRecover event is emitted. + eventType := "" + for _, evt := range result.Events { + if strings.Contains(evt.Type, "FlowEpoch.EpochRecover") { + eventType = evt.Type + break + } + } + require.NotEmpty(s.T(), eventType, "expected FlowEpoch.EpochRecover event type") + events, err := s.Client.GetEventsForBlockIDs(s.Ctx, eventType, []sdk.Identifier{result.BlockID}) + require.NoError(s.T(), err) + require.Equal(s.T(), events[0].Events[0].Type, eventType) + + // 7. Ensure the network transitions into the recovery epoch and finalizes the first view of the recovery epoch. 
+ startViewOfNextEpoch := uint64(txArgs[1].(cadence.UInt64)) + s.TimedLogf("waiting to transition into recovery epoch (finalized view %d)", startViewOfNextEpoch) + s.AwaitFinalizedView(s.Ctx, startViewOfNextEpoch, 2*time.Minute, 500*time.Millisecond) + s.TimedLogf("observed finalized first view of recovery epoch %d", startViewOfNextEpoch) + + s.AssertInEpoch(s.Ctx, 1) +}