Ln/max tip age bug fix (#1392)
* max tip age fix

* move to server.go

* start it

* hotfix

* hotfix

* Add more logging

* Fix domain logging

* 10x base duration

* down to 3x

* revert

* more logging

* Improve logging and fix csv flag issue

* Remove redundant outbound ip group check

* Fix flag

* Dont erase node ever...

* Add logging

* Finally fix string slice bs

* ugh

* Reject unroutable addresses

* Allow unroutable for inbound

* Allow unroutable

* hotfix

* hotfix 2

* cleanup

* gofmt

* address DH feedback

* adjust max tip age pos

* remove printAllActiveValidators

* 24h for max tip age

* address feedback

* remove more logging

---------

Co-authored-by: diamondhands <diamondhands@bitcloutdev.com>
lazynina and diamondhands0 authored Jul 25, 2024
1 parent c0f4967 commit 57373bf
Showing 10 changed files with 172 additions and 50 deletions.
25 changes: 20 additions & 5 deletions cmd/config.go
@@ -7,6 +7,7 @@ import (
"net/url"
"os"
"path/filepath"
"strings"
)

type Config struct {
@@ -87,6 +88,17 @@ type Config struct {
CheckpointSyncingProviders []string
}

// Viper's GetStringSlice doesn't split comma-separated values that come in
// through environment variables. This is the suggested workaround:
// https://github.com/spf13/viper/issues/380
func GetStringSliceWorkaround(flagName string) []string {
value := viper.GetString(flagName)
if value == "" || value == " " {
return []string{}
}
return strings.Split(value, ",")
}

func LoadConfig() *Config {
config := Config{}

@@ -134,9 +146,10 @@ func LoadConfig() *Config {
config.TransactionValidationRefreshIntervalMillis = viper.GetUint64("transaction-validation-refresh-interval-millis")

// Peers
config.ConnectIPs = viper.GetStringSlice("connect-ips")
config.AddIPs = viper.GetStringSlice("add-ips")
config.AddSeeds = viper.GetStringSlice("add-seeds")
config.ConnectIPs = GetStringSliceWorkaround("connect-ips")
glog.V(2).Infof("Connect IPs read in: %v", config.ConnectIPs)
config.AddIPs = GetStringSliceWorkaround("add-ips")
config.AddSeeds = GetStringSliceWorkaround("add-seeds")
config.TargetOutboundPeers = viper.GetUint32("target-outbound-peers")
config.StallTimeoutSeconds = viper.GetUint64("stall-timeout-seconds")

@@ -152,7 +165,7 @@
config.PeerConnectionRefreshIntervalMillis = viper.GetUint64("peer-connection-refresh-interval-millis")

// Mining + Admin
config.MinerPublicKeys = viper.GetStringSlice("miner-public-keys")
config.MinerPublicKeys = GetStringSliceWorkaround("miner-public-keys")
config.NumMiningThreads = viper.GetUint64("num-mining-threads")

// Fees
@@ -165,7 +178,9 @@
config.BlockCypherAPIKey = viper.GetString("block-cypher-api-key")
config.BlockProducerSeed = viper.GetString("block-producer-seed")
config.TrustedBlockProducerStartHeight = viper.GetUint64("trusted-block-producer-start-height")
// TODO: Couldn't get this to work with an environment variable
config.TrustedBlockProducerPublicKeys = viper.GetStringSlice("trusted-block-producer-public-keys")
glog.V(2).Infof("Trusted Block Producer Public Keys: %v", config.TrustedBlockProducerPublicKeys)

// Logging
config.LogDirectory = viper.GetString("log-dir")
@@ -183,7 +198,7 @@
config.StateSyncerMempoolTxnSyncLimit = viper.GetUint64("state-syncer-mempool-txn-sync-limit")

// PoS Checkpoint Syncing
config.CheckpointSyncingProviders = viper.GetStringSlice("checkpoint-syncing-providers")
config.CheckpointSyncingProviders = GetStringSliceWorkaround("checkpoint-syncing-providers")
for _, provider := range config.CheckpointSyncingProviders {
if _, err := url.ParseRequestURI(provider); err != nil {
glog.Fatalf("Invalid checkpoint syncing provider URL: %v", provider)
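
Side note on the workaround above: the viper behavior it routes around is the one described in spf13/viper#380, where GetStringSlice returns an env-var value as a single element instead of splitting it on commas. Below is a minimal, runnable sketch of the difference; the DESO env prefix and the connect-ips flag name are illustrative assumptions, not taken from this commit.

package main

import (
	"fmt"
	"os"
	"strings"

	"github.com/spf13/viper"
)

// Same logic as GetStringSliceWorkaround above: read the raw string and
// split on commas ourselves instead of relying on viper.GetStringSlice.
func getStringSliceWorkaround(flagName string) []string {
	value := viper.GetString(flagName)
	if value == "" || value == " " {
		return []string{}
	}
	return strings.Split(value, ",")
}

func main() {
	// Assumed prefix and flag name, purely for illustration.
	os.Setenv("DESO_CONNECT_IPS", "1.2.3.4:17000,5.6.7.8:17000")
	viper.SetEnvPrefix("DESO")
	viper.SetEnvKeyReplacer(strings.NewReplacer("-", "_"))
	viper.AutomaticEnv()

	// GetStringSlice does not split the env-var value on commas...
	fmt.Println(viper.GetStringSlice("connect-ips")) // [1.2.3.4:17000,5.6.7.8:17000]
	// ...whereas the workaround returns the two addresses separately.
	fmt.Println(getStringSliceWorkaround("connect-ips")) // [1.2.3.4:17000 5.6.7.8:17000]
}
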
18 changes: 11 additions & 7 deletions cmd/node.go
@@ -250,7 +250,7 @@ func (node *Node) Start(exitChannels ...*chan struct{}) {
// records to the DB. In this case, the snapshot is corrupted and needs to be computed. See the
// comment at the top of snapshot.go for more information on how this works.
if shouldRestart {
glog.Infof(lib.CLog(lib.Red, fmt.Sprintf("Start: Got en error while starting server and shouldRestart "+
glog.Infof(lib.CLog(lib.Red, fmt.Sprintf("Start: Got an error while starting server and shouldRestart "+
"is true. Node will be erased and resynced. Error: (%v)", err)))
node.nodeMessageChan <- lib.NodeErase
return
@@ -383,12 +383,16 @@ func (node *Node) listenToNodeMessages(exitChannels ...*chan struct{}) {
glog.Infof("Node.listenToNodeMessages: Finished stopping node")
switch operation {
case lib.NodeErase:
if err := os.RemoveAll(node.Config.DataDirectory); err != nil {
glog.Fatal(lib.CLog(lib.Red, fmt.Sprintf("IMPORTANT: Problem removing the directory (%v), you "+
"should run `rm -rf %v` to delete it manually. Error: (%v)", node.Config.DataDirectory,
node.Config.DataDirectory, err)))
return
}
glog.Error("Not actually erasing node")
// TODO: Clean up this path. This NodeErase code was added when we upgraded to HyperSync,
// but it's not worth compromising the node if a false positive sends us here.

//if err := os.RemoveAll(node.Config.DataDirectory); err != nil {
// glog.Fatal(lib.CLog(lib.Red, fmt.Sprintf("IMPORTANT: Problem removing the directory (%v), you "+
// "should run `rm -rf %v` to delete it manually. Error: (%v)", node.Config.DataDirectory,
// node.Config.DataDirectory, err)))
// return
//}
}

glog.Infof("Node.listenToNodeMessages: Restarting node")
27 changes: 27 additions & 0 deletions consensus/event_loop.go
@@ -680,6 +680,14 @@ func (fe *fastHotStuffEventLoop) tryConstructVoteQCInCurrentView() *FastHotStuff
// Fetch the validator list at the tip.
validatorList := fe.tip.validatorList

for _, validatorItem := range validatorList {
glog.V(2).Infof("Validator: Key: %v, Stake: %v, Domains: %v",
validatorItem.GetPublicKey().ToString(),
validatorItem.GetStakeAmount().ToBig().String(),
DomainsToString(validatorItem.GetDomains()),
)
}

// Compute the chain tip's signature payload.
voteSignaturePayload := GetVoteSignaturePayload(tipBlock.GetView(), tipBlock.GetBlockHash())

@@ -711,6 +719,10 @@ func (fe *fastHotStuffEventLoop) tryConstructVoteQCInCurrentView() *FastHotStuff
signatures = append(signatures, vote.GetSignature())
}

glog.V(2).Infof("FastHotStuffEventLoop.tryConstructVoteQCInCurrentView: "+
"Total Stake: %s, Total Voting Stake: %s, Signers List: %v, Num Signatures: %d",
totalStake.ToBig().String(), totalVotingStake.ToBig().String(), signersList, len(signatures))

// If we don't have a super-majority vote for the chain tip, then we can't build a QC.
if !isSuperMajorityStake(totalVotingStake, totalStake) {
return nil
@@ -756,6 +768,17 @@ func (fe *fastHotStuffEventLoop) tryConstructTimeoutQCInCurrentSt
// proposed in the next view. So if we want to propose a timeout QC in the current view, we need
// to aggregate timeouts from the previous one.
timeoutsByValidator := fe.timeoutsSeenByView[fe.currentView-1]
glog.V(2).Infof("FastHotStuffEventLoop.tryConstructTimeoutQCInCurrentView: " +
"Printing timeouts by validator: ")
for key := range timeoutsByValidator {
validatorItem, exists := fe.tip.validatorLookup[key]
if !exists {
glog.V(2).Infof("Validator not found for key %v", key)
continue
}
glog.V(2).Infof("Validator: Key: %v, Stake: %v, Domains: %v",
key, validatorItem.GetStakeAmount().ToBig().String(), DomainsToString(validatorItem.GetDomains()))
}

// Tracks the highQC from validators as we go along.
var validatorsHighQC QuorumCertificate
@@ -827,6 +850,10 @@ func (fe *fastHotStuffEventLoop) tryConstructTimeoutQCInCurrentSt
highQCViews[ii] = timeout.GetHighQC().GetView()
}

glog.V(2).Infof("FastHotStuffEventLoop.tryConstructTimeoutQCInCurrentView: "+
"totalStake: %s, totalTimedOutStake: %s",
totalStake.ToBig().String(), totalTimedOutStake.ToBig().String())

// Check if we have a super majority of stake that has timed out
if !isSuperMajorityStake(totalTimedOutStake, totalStake) {
return nil
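
Both QC paths in this file gate on isSuperMajorityStake, whose body doesn't appear in this diff. As a rough sketch of the standard BFT threshold it presumably implements (an assumption; the repository's version operates on uint256 stake values), voting stake must strictly exceed two thirds of total stake, which can be checked without division:

package main

import (
	"fmt"
	"math/big"
)

// Sketch of a 2/3 super-majority check: comparing 3*votingStake against
// 2*totalStake in integer arithmetic avoids any division or rounding.
func isSuperMajorityStakeSketch(votingStake, totalStake *big.Int) bool {
	lhs := new(big.Int).Mul(votingStake, big.NewInt(3))
	rhs := new(big.Int).Mul(totalStake, big.NewInt(2))
	return lhs.Cmp(rhs) > 0
}

func main() {
	fmt.Println(isSuperMajorityStakeSketch(big.NewInt(67), big.NewInt(100))) // true
	fmt.Println(isSuperMajorityStakeSketch(big.NewInt(66), big.NewInt(100))) // false: 66 is not more than two thirds of 100
}
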
21 changes: 20 additions & 1 deletion consensus/integration_test_types.go
@@ -1,6 +1,7 @@
package consensus

import (
"github.com/golang/glog"
"sync"
"time"

@@ -110,6 +111,18 @@ func (node *validatorNode) GetDomains() [][]byte {
return [][]byte{}
}

func DomainsToString(domainsBytes [][]byte) string {
domains := ""
for _, domain := range domainsBytes {
domains += string(domain) + ", "
}
return domains
}

func (node *validatorNode) GetDomainsString() string {
return DomainsToString(node.GetDomains())
}

func (node *validatorNode) ProcessBlock(incomingBlock *block) {
node.lock.Lock()
defer node.lock.Unlock()
@@ -313,7 +326,10 @@ func (node *validatorNode) ProcessTimeout(timeout TimeoutMessage) {
return
}

node.eventLoop.ProcessValidatorTimeout(timeout)
if err := node.eventLoop.ProcessValidatorTimeout(timeout); err != nil {
glog.V(2).Infof("ProcessTimeout: Error processing timeout from validator %v: %v",
timeout.GetPublicKey().ToString(), err)
}
}

func (node *validatorNode) Start() {
@@ -400,6 +416,9 @@ func (node *validatorNode) broadcastTimeout(event *FastHotStuffEvent) {

// Broadcast the block to all validators.
for _, validator := range node.validatorNodes {
glog.V(2).Infof("broadcastTimeout: Broadcasting timeout message from validator "+
"%v (%v) to validator %v (%v)", node.GetDomainsString(), node.GetPublicKey().ToString(),
validator.GetDomainsString(), validator.GetPublicKey().ToString())
go validator.ProcessTimeout(timeout)
}
}
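
A small aside on DomainsToString above: it leaves a trailing ", " after the last domain. A strings.Join-based variant (a sketch, not part of this commit; assumes "strings" is imported) produces the same list without the trailing separator:

// Illustrative alternative to DomainsToString with no trailing ", ".
func domainsToStringSketch(domainsBytes [][]byte) string {
	domains := make([]string, 0, len(domainsBytes))
	for _, domainBytes := range domainsBytes {
		domains = append(domains, string(domainBytes))
	}
	return strings.Join(domains, ", ")
}
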
15 changes: 13 additions & 2 deletions lib/blockchain.go
@@ -560,6 +560,16 @@ type Blockchain struct {
timer *Timer
}

func (bc *Blockchain) getHighestCheckpointView() uint64 {
bc.checkpointBlockInfoLock.RLock()
defer bc.checkpointBlockInfoLock.RUnlock()

if bc.checkpointBlockInfo != nil {
return bc.checkpointBlockInfo.LatestView
}
return uint64(0)
}

func (bc *Blockchain) updateCheckpointBlockInfo() {
if len(bc.checkpointSyncingProviders) == 0 {
glog.V(2).Info("updateCheckpointBlockInfo: No checkpoint syncing providers set. Skipping update.")
@@ -580,7 +590,7 @@ func (bc *Blockchain) updateCheckpointBlockInfo() {
// from the checkpoint syncing providers. We'll combine these two pieces of information to
// form the final checkpoint block info.
var highestHeightCheckpointBlockInfo *CheckpointBlockInfo
highestView := uint64(0)
highestView := bc.getHighestCheckpointView()
for _, checkpointBlockInfo := range checkpointBlockInfos {
if checkpointBlockInfo.Error != nil {
glog.Errorf("updateCheckpointBlockInfo: Error getting checkpoint block info: %v", checkpointBlockInfo.Error)
@@ -914,7 +924,8 @@ func NewBlockchain(
for _, keyStr := range trustedBlockProducerPublicKeyStrs {
pkBytes, _, err := Base58CheckDecode(keyStr)
if err != nil {
return nil, fmt.Errorf("Error decoding trusted block producer public key: %v", err)
return nil, fmt.Errorf("Error decoding trusted block producer "+
"public key: %v, %v", trustedBlockProducerPublicKeyStrs, err)
}
trustedBlockProducerPublicKeys[MakePkMapKey(pkBytes)] = true
}
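
The effect of seeding highestView from getHighestCheckpointView is to make checkpoint selection monotonic: a provider reporting a lower view than the node has already accepted can no longer win. A simplified sketch of that selection rule follows (the struct is a reduced stand-in; only the LatestView and error fields visible in this diff are modeled):

package main

import "fmt"

// Reduced stand-in for CheckpointBlockInfo as it appears in this diff.
type checkpointInfoSketch struct {
	LatestView uint64
	Err        error
}

// Take the highest view across providers, starting from the view the node
// has already accepted so a lagging provider can't roll the checkpoint back.
func highestViewSketch(localView uint64, infos []checkpointInfoSketch) uint64 {
	highestView := localView
	for _, info := range infos {
		if info.Err != nil {
			continue // skip providers that returned an error
		}
		if info.LatestView > highestView {
			highestView = info.LatestView
		}
	}
	return highestView
}

func main() {
	infos := []checkpointInfoSketch{{LatestView: 90}, {LatestView: 120}}
	fmt.Println(highestViewSketch(100, infos)) // 120: a provider ahead of us wins
	fmt.Println(highestViewSketch(130, infos)) // 130: nobody can move us backwards
}
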
45 changes: 25 additions & 20 deletions lib/connection_manager.go
@@ -159,26 +159,31 @@ func NewConnectionManager(

// Check if the address passed shares a group with any addresses already in our data structures.
func (cmgr *ConnectionManager) IsFromRedundantOutboundIPAddress(na *wire.NetAddress) bool {
groupKey := addrmgr.GroupKey(na)
// For the sake of running multiple nodes on the same machine, we allow localhost connections.
if groupKey == "local" {
return false
}

cmgr.mtxOutboundConnIPGroups.Lock()
numGroupsForKey := cmgr.outboundConnIPGroups[groupKey]
cmgr.mtxOutboundConnIPGroups.Unlock()

if numGroupsForKey != 0 && numGroupsForKey != 1 {
glog.V(2).Infof("IsFromRedundantOutboundIPAddress: Found numGroupsForKey != (0 or 1). Is (%d) "+
"instead for addr (%s) and group key (%s). This "+
"should never happen.", numGroupsForKey, na.IP.String(), groupKey)
}

if numGroupsForKey == 0 {
return false
}
return true
return false
// TODO: Delete this code once we're 100% sure we don't need it.
// This group checking was required as an early DDOS prevention measure,
// but it is no longer required because most nodes have other ways of preventing attacks.

//groupKey := addrmgr.GroupKey(na)
//// For the sake of running multiple nodes on the same machine, we allow localhost connections.
//if groupKey == "local" {
// return false
//}
//
//cmgr.mtxOutboundConnIPGroups.Lock()
//numGroupsForKey := cmgr.outboundConnIPGroups[groupKey]
//cmgr.mtxOutboundConnIPGroups.Unlock()
//
//if numGroupsForKey != 0 && numGroupsForKey != 1 {
// glog.V(2).Infof("IsFromRedundantOutboundIPAddress: Found numGroupsForKey != (0 or 1). Is (%d) "+
// "instead for addr (%s) and group key (%s). This "+
// "should never happen.", numGroupsForKey, na.IP.String(), groupKey)
//}
//
//if numGroupsForKey == 0 {
// return false
//}
//return true
}

func (cmgr *ConnectionManager) AddToGroupKey(na *wire.NetAddress) {
4 changes: 2 additions & 2 deletions lib/constants.go
@@ -1131,7 +1131,7 @@ var DeSoMainnetParams = DeSoParams{
MinChainWorkHex: "000000000000000000000000000000000000000000000000006314f9a85a949b",

MaxTipAgePoW: 24 * time.Hour,
MaxTipAgePoS: time.Hour,
MaxTipAgePoS: 24 * time.Hour,

// ===================================================================================
// Mainnet Bitcoin config
@@ -1568,7 +1568,7 @@ var DeSoTestnetParams = DeSoParams{
// TODO: Set to one day when we launch the testnet. In the meantime this value
// is more useful for local testing.
MaxTipAgePoW: time.Hour * 24,
MaxTipAgePoS: time.Hour,
MaxTipAgePoS: time.Hour * 24,

// Difficulty can't decrease to below 50% of its previous value or increase
// to above 200% of its previous value.
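
For context on what this parameter gates: a max tip age is the usual way a node decides whether its chain tip is fresh enough to consider itself synced. The sketch below shows the typical pattern (an assumption about usage; the repository's actual sync-state checks live elsewhere). Raising MaxTipAgePoS from one hour to 24 hours means a PoS node tolerates a day-old tip before treating itself as behind.

package main

import (
	"fmt"
	"time"
)

// Typical max-tip-age check: the tip counts as current only if its
// timestamp is within maxTipAge of the local clock.
func isTipCurrentSketch(tipTimestamp time.Time, maxTipAge time.Duration) bool {
	return time.Since(tipTimestamp) <= maxTipAge
}

func main() {
	tip := time.Now().Add(-3 * time.Hour)
	fmt.Println(isTipCurrentSketch(tip, time.Hour))    // false under the old 1h limit
	fmt.Println(isTipCurrentSketch(tip, 24*time.Hour)) // true under the new 24h limit
}
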
7 changes: 6 additions & 1 deletion lib/network_manager.go
@@ -976,7 +976,8 @@ func (nm *NetworkManager) Disconnect(rn *RemoteNode, disconnectReason string) {
if rn == nil {
return
}
glog.V(2).Infof("NetworkManager.Disconnect: Disconnecting from remote node id=%v", rn.GetId())
glog.V(2).Infof("NetworkManager.Disconnect: Disconnecting from remote "+
"node id=%v for reason %v", rn.GetId(), disconnectReason)
rn.Disconnect(disconnectReason)
nm.removeRemoteNodeFromIndexer(rn)
}
@@ -1301,6 +1302,10 @@ func (nm *NetworkManager) handleHandshakeCompletePoSMessage(remoteNode *RemoteNo
existingValidator, ok := nm.GetValidatorOutboundIndex().Get(validatorPk.Serialize())
if ok && remoteNode.GetId() != existingValidator.GetId() {
if remoteNode.IsPersistent() && !existingValidator.IsPersistent() {
glog.Errorf("NetworkManager.handleHandshakeCompletePoSMessage: Outbound RemoteNode with duplicate validator public key. "+
"Existing validator id: %v, new validator id: %v, ip old: %v, ip new: %v",
existingValidator.GetId().ToUint64(), remoteNode.GetId().ToUint64(),
existingValidator.GetNetAddress(), remoteNode.GetNetAddress())
nm.Disconnect(existingValidator, "outbound - duplicate validator public key")
return nil
}
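
The duplicate-validator branch above prefers a persistent connection over a non-persistent one when two outbound RemoteNodes present the same validator public key. Distilled into a sketch (types are simplified stand-ins, and the fallback of keeping the existing node is an assumption, since that path isn't shown in this diff):

package main

import "fmt"

// Minimal stand-in for a connected validator node.
type nodeSketch struct {
	ID         uint64
	Persistent bool
}

// Keep the persistent connection when two nodes claim the same validator
// key; report the other one for disconnection.
func resolveDuplicateValidator(existing, incoming nodeSketch) (keep, drop nodeSketch) {
	if incoming.Persistent && !existing.Persistent {
		return incoming, existing
	}
	return existing, incoming
}

func main() {
	keep, drop := resolveDuplicateValidator(nodeSketch{ID: 1}, nodeSketch{ID: 2, Persistent: true})
	fmt.Printf("keep=%d drop=%d\n", keep.ID, drop.ID) // keep=2 drop=1
}
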
8 changes: 7 additions & 1 deletion lib/pos_consensus.go
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,11 @@ func (fc *FastHotStuffConsensus) HandleLocalTimeoutEvent(event *consensus.FastHo
// Broadcast the block to the validator network
validators := fc.networkManager.GetConnectedValidators()
for _, validator := range validators {
glog.V(2).Infof("FastHotStuffConsensus.HandleLocalTimeoutEvent: Broadcasting "+
"timeout msg %v to validator ID=%v pubkey=%v addr=%v",
timeoutMsg.ToString(),
validator.GetId(),
validator.validatorPublicKey.ToString(), validator.GetNetAddress())
sendMessageToRemoteNodeAsync(validator, timeoutMsg)
}

@@ -537,7 +542,8 @@
// HandleValidatorTimeout is called when we receive a validator timeout message from a peer. This function
// processes the timeout locally in the FastHotStuffEventLoop.
func (fc *FastHotStuffConsensus) HandleValidatorTimeout(pp *Peer, msg *MsgDeSoValidatorTimeout) ([]*BlockHash, error) {
glog.V(2).Infof("FastHotStuffConsensus.HandleValidatorTimeout: %s", msg.ToString())
glog.V(2).Infof("FastHotStuffConsensus.HandleValidatorTimeout: %s [%v %v]", msg.ToString(),
msg.VotingPublicKey.ToString(), msg.TimedOutView)
glog.V(2).Infof("FastHotStuffConsensus.HandleValidatorTimeout: %s", fc.fastHotStuffEventLoop.ToString())

// Hold a write lock on the consensus, since we need to update the timeout message in the