Ln/max tip age bug fix #1392

Merged: 33 commits (Jul 25, 2024)
Commits
cf6b388
max tip age fix
lazynina Jul 23, 2024
6e5c750
move to server.go
lazynina Jul 23, 2024
9e12445
start it
lazynina Jul 23, 2024
4bd3da6
hotfix
lazynina Jul 23, 2024
6e1832d
hotfix
lazynina Jul 23, 2024
4ff30d2
Add more logging
diamondhands0 Jul 23, 2024
6f2ffdb
Fix domain logging
diamondhands0 Jul 23, 2024
158a883
10x base duration
lazynina Jul 23, 2024
8aee1ef
down to 3x
lazynina Jul 23, 2024
e407011
revert
lazynina Jul 23, 2024
f34099f
more logging
diamondhands0 Jul 23, 2024
b79de43
Merge branch 'ln/max-tip-age-bug-fix' of https://github.com/deso-prot…
diamondhands0 Jul 23, 2024
01ba0c2
Improve logging and fix csv flag issue
diamondhands0 Jul 23, 2024
94439ce
Remove redundant outbound ip group check
diamondhands0 Jul 23, 2024
fc8ea8f
Fix flag
diamondhands0 Jul 23, 2024
a40c05f
Dont erase node ever...
diamondhands0 Jul 23, 2024
ca0e94d
Add logging
diamondhands0 Jul 23, 2024
ca996ab
Finally fix string slice bs
diamondhands0 Jul 23, 2024
7d6708d
ugh
diamondhands0 Jul 23, 2024
f2426c9
Reject unroutable addresses
diamondhands0 Jul 23, 2024
be7f2e7
Allow unroutable for inbound
diamondhands0 Jul 23, 2024
d67fd0c
Allow unroutable
diamondhands0 Jul 23, 2024
31fa104
hotfix
lazynina Jul 23, 2024
b893c1b
Merge branch 'ln/max-tip-age-bug-fix' of github.com:deso-protocol/cor…
lazynina Jul 23, 2024
aaef35c
hotfix 2
lazynina Jul 23, 2024
f92a9d8
cleanup
lazynina Jul 24, 2024
281d368
gofmt
lazynina Jul 24, 2024
6c2b1eb
address DH feedback
lazynina Jul 25, 2024
6b6b153
adjust max tip age pos
lazynina Jul 25, 2024
1ea1e6d
remove printAllActiveValidators
lazynina Jul 25, 2024
11bc4d4
24h for max tip age
lazynina Jul 25, 2024
8c97a24
address feedback
lazynina Jul 25, 2024
b9c8a6e
remove more logging
lazynina Jul 25, 2024
25 changes: 20 additions & 5 deletions cmd/config.go
@@ -7,6 +7,7 @@ import (
"net/url"
"os"
"path/filepath"
"strings"
)

type Config struct {
@@ -86,6 +87,17 @@ type Config struct {
CheckpointSyncingProviders []string
}

// Viper doesn't handle string-slice flags correctly when they're passed as
// environment variables. This is the suggested workaround:
// https://github.com/spf13/viper/issues/380
func GetStringSliceWorkaround(flagName string) []string {
Review comment (Member):
I applied this thing pretty quickly, but I think we need to keep it generally so I'd leave it. If you don't have it then it literally can't process comma-separated values when you pass them as env vars.

value := viper.GetString(flagName)
if value == "" || value == " " {
return []string{}
}
return strings.Split(value, ",")
}
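As a rough standalone illustration of the problem this workaround addresses (not part of the PR; the env-var binding below is an assumption), viper's built-in GetStringSlice treats a comma-separated environment variable as a single element, while splitting the raw string recovers the intended list:

```go
package main

import (
	"fmt"
	"os"
	"strings"

	"github.com/spf13/viper"
)

// getStringSliceWorkaround mirrors the helper above: read the raw string and
// split it on commas ourselves instead of relying on viper's slice handling.
func getStringSliceWorkaround(flagName string) []string {
	value := viper.GetString(flagName)
	if value == "" || value == " " {
		return []string{}
	}
	return strings.Split(value, ",")
}

func main() {
	// Simulate passing --connect-ips as an environment variable.
	os.Setenv("CONNECT_IPS", "1.2.3.4:17000,5.6.7.8:17000")
	viper.SetEnvKeyReplacer(strings.NewReplacer("-", "_"))
	viper.AutomaticEnv()

	// Likely prints a single element containing the whole comma-separated string.
	fmt.Println(viper.GetStringSlice("connect-ips"))
	// Prints the two addresses as separate elements.
	fmt.Println(getStringSliceWorkaround("connect-ips"))
}
```

With this in place, flags like --connect-ips behave the same whether they arrive on the command line, in a config file, or via the environment.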

func LoadConfig() *Config {
config := Config{}

@@ -132,9 +144,10 @@ func LoadConfig() *Config {
config.TransactionValidationRefreshIntervalMillis = viper.GetUint64("transaction-validation-refresh-interval-millis")

// Peers
config.ConnectIPs = viper.GetStringSlice("connect-ips")
config.AddIPs = viper.GetStringSlice("add-ips")
config.AddSeeds = viper.GetStringSlice("add-seeds")
config.ConnectIPs = GetStringSliceWorkaround("connect-ips")
glog.V(2).Infof("Connect IPs read in: %v", config.ConnectIPs)
config.AddIPs = GetStringSliceWorkaround("add-ips")
config.AddSeeds = GetStringSliceWorkaround("add-seeds")
config.TargetOutboundPeers = viper.GetUint32("target-outbound-peers")
config.StallTimeoutSeconds = viper.GetUint64("stall-timeout-seconds")

@@ -150,7 +163,7 @@
config.PeerConnectionRefreshIntervalMillis = viper.GetUint64("peer-connection-refresh-interval-millis")

// Mining + Admin
config.MinerPublicKeys = viper.GetStringSlice("miner-public-keys")
config.MinerPublicKeys = GetStringSliceWorkaround("miner-public-keys")
config.NumMiningThreads = viper.GetUint64("num-mining-threads")

// Fees
@@ -163,7 +176,9 @@
config.BlockCypherAPIKey = viper.GetString("block-cypher-api-key")
config.BlockProducerSeed = viper.GetString("block-producer-seed")
config.TrustedBlockProducerStartHeight = viper.GetUint64("trusted-block-producer-start-height")
// TODO: Couldn't get this to work with environment variables
config.TrustedBlockProducerPublicKeys = viper.GetStringSlice("trusted-block-producer-public-keys")
glog.V(2).Infof("Trusted Block Producer Public Keys: %v", config.TrustedBlockProducerPublicKeys)

// Logging
config.LogDirectory = viper.GetString("log-dir")
@@ -181,7 +196,7 @@
config.StateSyncerMempoolTxnSyncLimit = viper.GetUint64("state-syncer-mempool-txn-sync-limit")

// PoS Checkpoint Syncing
config.CheckpointSyncingProviders = viper.GetStringSlice("checkpoint-syncing-providers")
config.CheckpointSyncingProviders = GetStringSliceWorkaround("checkpoint-syncing-providers")
for _, provider := range config.CheckpointSyncingProviders {
if _, err := url.ParseRequestURI(provider); err != nil {
glog.Fatalf("Invalid checkpoint syncing provider URL: %v", provider)
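As a quick illustration of the provider validation just above (standalone, not part of the PR; the hostnames are placeholders): url.ParseRequestURI from the standard library accepts absolute URLs and rejects bare host strings, so checkpoint syncing providers must be passed with a scheme.

```go
package main

import (
	"fmt"
	"net/url"
)

func main() {
	// The first form should pass validation; the second, with no scheme, should fail.
	for _, provider := range []string{"https://checkpoint.example.com", "checkpoint.example.com"} {
		if _, err := url.ParseRequestURI(provider); err != nil {
			fmt.Printf("rejected %q: %v\n", provider, err)
		} else {
			fmt.Printf("accepted %q\n", provider)
		}
	}
}
```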
18 changes: 11 additions & 7 deletions cmd/node.go
@@ -250,7 +250,7 @@ func (node *Node) Start(exitChannels ...*chan struct{}) {
// records to the DB. In this case, the snapshot is corrupted and needs to be computed. See the
// comment at the top of snapshot.go for more information on how this works.
if shouldRestart {
glog.Infof(lib.CLog(lib.Red, fmt.Sprintf("Start: Got en error while starting server and shouldRestart "+
glog.Infof(lib.CLog(lib.Red, fmt.Sprintf("Start: Got an error while starting server and shouldRestart "+
"is true. Node will be erased and resynced. Error: (%v)", err)))
node.nodeMessageChan <- lib.NodeErase
return
@@ -383,12 +383,16 @@ func (node *Node) listenToNodeMessages(exitChannels ...*chan struct{}) {
glog.Infof("Node.listenToNodeMessages: Finished stopping node")
switch operation {
case lib.NodeErase:
if err := os.RemoveAll(node.Config.DataDirectory); err != nil {
glog.Fatal(lib.CLog(lib.Red, fmt.Sprintf("IMPORTANT: Problem removing the directory (%v), you "+
"should run `rm -rf %v` to delete it manually. Error: (%v)", node.Config.DataDirectory,
node.Config.DataDirectory, err)))
return
}
glog.Error("Not actually erasing node")
// TODO: Clean up this path. This NodeErase code was added when we upgraded to HyperSync,
// but it's not worth compromising the node if a false positive sends us here.

//if err := os.RemoveAll(node.Config.DataDirectory); err != nil {
// glog.Fatal(lib.CLog(lib.Red, fmt.Sprintf("IMPORTANT: Problem removing the directory (%v), you "+
// "should run `rm -rf %v` to delete it manually. Error: (%v)", node.Config.DataDirectory,
// node.Config.DataDirectory, err)))
// return
//}
}

glog.Infof("Node.listenToNodeMessages: Restarting node")
27 changes: 27 additions & 0 deletions consensus/event_loop.go
@@ -680,6 +680,14 @@ func (fe *fastHotStuffEventLoop) tryConstructVoteQCInCurrentView() *FastHotStuff
// Fetch the validator list at the tip.
validatorList := fe.tip.validatorList

for _, validatorItem := range validatorList {
glog.V(2).Infof("Validator: Key: %v, Stake: %v, Domains: %v",
validatorItem.GetPublicKey().ToString(),
validatorItem.GetStakeAmount().ToBig().String(),
DomainsToString(validatorItem.GetDomains()),
)
}

// Compute the chain tip's signature payload.
voteSignaturePayload := GetVoteSignaturePayload(tipBlock.GetView(), tipBlock.GetBlockHash())

@@ -711,6 +719,10 @@ func (fe *fastHotStuffEventLoop) tryConstructVoteQCInCurrentView() *FastHotStuff
signatures = append(signatures, vote.GetSignature())
}

glog.V(2).Infof("FastHotStuffEventLoop.tryConstructVoteQCInCurrentView: "+
"Total Stake: %s, Total Voting Stake: %s, Signers List: %v, Num Signatures: %d",
totalStake.ToBig().String(), totalVotingStake.ToBig().String(), signersList, len(signatures))

// If we don't have a super-majority vote for the chain tip, then we can't build a QC.
if !isSuperMajorityStake(totalVotingStake, totalStake) {
return nil
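For readers unfamiliar with the threshold, here is a minimal sketch of the standard BFT super-majority test that a check like isSuperMajorityStake performs: the voting stake must exceed two thirds of total stake. This is an illustration using math/big, not the consensus package's actual implementation, which may differ in representation and edge cases.

```go
package main

import (
	"fmt"
	"math/big"
)

// isSuperMajority reports whether votingStake > (2/3) * totalStake,
// computed as 3*votingStake > 2*totalStake to avoid division.
func isSuperMajority(votingStake, totalStake *big.Int) bool {
	lhs := new(big.Int).Mul(votingStake, big.NewInt(3))
	rhs := new(big.Int).Mul(totalStake, big.NewInt(2))
	return lhs.Cmp(rhs) > 0
}

func main() {
	fmt.Println(isSuperMajority(big.NewInt(67), big.NewInt(100))) // true
	fmt.Println(isSuperMajority(big.NewInt(66), big.NewInt(100))) // false
}
```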
@@ -756,6 +768,17 @@ func (fe *fastHotStuffEventLoop) tryConstructTimeoutQCInCurrentView() *FastHotSt
// proposed in the next view. So if we want to propose a timeout QC in the current view, we need
// to aggregate timeouts from the previous one.
timeoutsByValidator := fe.timeoutsSeenByView[fe.currentView-1]
glog.V(2).Infof("FastHotStuffEventLoop.tryConstructTimeoutQCInCurrentView: " +
"Printing timeouts by validator: ")
for key := range timeoutsByValidator {
validatorItem, exists := fe.tip.validatorLookup[key]
if !exists {
glog.V(2).Infof("Validator not found for key %v", key)
continue
}
glog.V(2).Infof("Validator: Key: %v, Stake: %v, Domains: %v",
key, validatorItem.GetStakeAmount().ToBig().String(), DomainsToString(validatorItem.GetDomains()))
}

// Tracks the highQC from validators as we go along.
var validatorsHighQC QuorumCertificate
@@ -827,6 +850,10 @@ func (fe *fastHotStuffEventLoop) tryConstructTimeoutQCInCurrentView() *FastHotSt
highQCViews[ii] = timeout.GetHighQC().GetView()
}

glog.V(2).Infof("FastHotStuffEventLoop.tryConstructTimeoutQCInCurrentView: "+
"totalStake: %s, totalTimedOutStake: %s",
totalStake.ToBig().String(), totalTimedOutStake.ToBig().String())

// Check if we have a super majority of stake that has timed out
if !isSuperMajorityStake(totalTimedOutStake, totalStake) {
return nil
21 changes: 20 additions & 1 deletion consensus/integration_test_types.go
@@ -1,6 +1,7 @@
package consensus

import (
"github.com/golang/glog"
"sync"
"time"

@@ -110,6 +111,18 @@ func (node *validatorNode) GetDomains() [][]byte {
return [][]byte{}
}

func DomainsToString(domainsBytes [][]byte) string {
domains := ""
for _, domain := range domainsBytes {
domains += string(domain) + ", "
}
return domains
}
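As a side note, an equivalent helper built on strings.Join (a sketch, not part of the PR) would avoid the trailing ", " that the loop above leaves behind:

```go
package consensus

import "strings"

// domainsToString joins validator domains (given as [][]byte) into a single
// comma-separated string for logging, with no trailing separator.
func domainsToString(domainsBytes [][]byte) string {
	domains := make([]string, 0, len(domainsBytes))
	for _, d := range domainsBytes {
		domains = append(domains, string(d))
	}
	return strings.Join(domains, ", ")
}
```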

func (node *validatorNode) GetDomainsString() string {
return DomainsToString(node.GetDomains())
}

func (node *validatorNode) ProcessBlock(incomingBlock *block) {
node.lock.Lock()
defer node.lock.Unlock()
@@ -313,7 +326,10 @@ func (node *validatorNode) ProcessTimeout(timeout TimeoutMessage) {
return
}

node.eventLoop.ProcessValidatorTimeout(timeout)
if err := node.eventLoop.ProcessValidatorTimeout(timeout); err != nil {
glog.V(2).Infof("ProcessTimeout: Error processing timeout from validator %v: %v",
timeout.GetPublicKey().ToString(), err)
}
}

func (node *validatorNode) Start() {
@@ -400,6 +416,9 @@ func (node *validatorNode) broadcastTimeout(event *FastHotStuffEvent) {

// Broadcast the block to all validators.
for _, validator := range node.validatorNodes {
glog.V(2).Infof("broadcastTimeout: Broadcasting timeout message from validator "+
"%v (%v) to validator %v (%v)", node.GetDomainsString(), node.GetPublicKey().ToString(),
validator.GetDomainsString(), validator.GetPublicKey().ToString())
go validator.ProcessTimeout(timeout)
}
}
15 changes: 13 additions & 2 deletions lib/blockchain.go
@@ -560,6 +560,16 @@ type Blockchain struct {
timer *Timer
}

func (bc *Blockchain) getHighestCheckpointView() uint64 {
bc.checkpointBlockInfoLock.RLock()
defer bc.checkpointBlockInfoLock.RUnlock()

if bc.checkpointBlockInfo != nil {
return bc.checkpointBlockInfo.LatestView
}
return uint64(0)
}

func (bc *Blockchain) updateCheckpointBlockInfo() {
if len(bc.checkpointSyncingProviders) == 0 {
glog.V(2).Info("updateCheckpointBlockInfo: No checkpoint syncing providers set. Skipping update.")
@@ -580,7 +590,7 @@ func (bc *Blockchain) updateCheckpointBlockInfo() {
// from the checkpoint syncing providers. We'll combine these two pieces of information to
// form the final checkpoint block info.
var highestHeightCheckpointBlockInfo *CheckpointBlockInfo
highestView := uint64(0)
highestView := bc.getHighestCheckpointView()
for _, checkpointBlockInfo := range checkpointBlockInfos {
if checkpointBlockInfo.Error != nil {
glog.Errorf("updateCheckpointBlockInfo: Error getting checkpoint block info: %v", checkpointBlockInfo.Error)
@@ -914,7 +924,8 @@ func NewBlockchain(
for _, keyStr := range trustedBlockProducerPublicKeyStrs {
pkBytes, _, err := Base58CheckDecode(keyStr)
if err != nil {
return nil, fmt.Errorf("Error decoding trusted block producer public key: %v", err)
return nil, fmt.Errorf("Error decoding trusted block producer "+
"public key: %v, %v", trustedBlockProducerPublicKeyStrs, err)
}
trustedBlockProducerPublicKeys[MakePkMapKey(pkBytes)] = true
}
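To spell out what the updateCheckpointBlockInfo change above accomplishes: by seeding highestView from the checkpoint the node already holds, a provider that returns an older checkpoint can never roll the node back. Below is a hedged, self-contained sketch of that selection logic; the type and field names are assumptions for illustration, not the real CheckpointBlockInfo struct.

```go
package checkpoint

// providerCheckpoint is a stand-in for a checkpoint syncing provider's response.
type providerCheckpoint struct {
	LatestView uint64
}

// pickCheckpoint returns the provider checkpoint with the highest view, but only
// if it is newer than the checkpoint we already have; otherwise it returns nil.
func pickCheckpoint(currentView uint64, fromProviders []providerCheckpoint) *providerCheckpoint {
	highestView := currentView // seed with what we already know
	var best *providerCheckpoint
	for i := range fromProviders {
		if fromProviders[i].LatestView > highestView {
			highestView = fromProviders[i].LatestView
			best = &fromProviders[i]
		}
	}
	return best
}
```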
45 changes: 25 additions & 20 deletions lib/connection_manager.go
@@ -159,26 +159,31 @@ func NewConnectionManager(

// Check if the address passed shares a group with any addresses already in our data structures.
func (cmgr *ConnectionManager) IsFromRedundantOutboundIPAddress(na *wire.NetAddress) bool {
groupKey := addrmgr.GroupKey(na)
// For the sake of running multiple nodes on the same machine, we allow localhost connections.
if groupKey == "local" {
return false
}

cmgr.mtxOutboundConnIPGroups.Lock()
numGroupsForKey := cmgr.outboundConnIPGroups[groupKey]
cmgr.mtxOutboundConnIPGroups.Unlock()

if numGroupsForKey != 0 && numGroupsForKey != 1 {
glog.V(2).Infof("IsFromRedundantOutboundIPAddress: Found numGroupsForKey != (0 or 1). Is (%d) "+
"instead for addr (%s) and group key (%s). This "+
"should never happen.", numGroupsForKey, na.IP.String(), groupKey)
}

if numGroupsForKey == 0 {
return false
}
return true
return false
// TODO: Delete this code once we're 100% sure we don't need it.
// This group checking was required as an early DDOS prevention measure,
// but it is no longer required because most nodes have other ways of preventing attacks.

//groupKey := addrmgr.GroupKey(na)
//// For the sake of running multiple nodes on the same machine, we allow localhost connections.
//if groupKey == "local" {
// return false
//}
//
//cmgr.mtxOutboundConnIPGroups.Lock()
//numGroupsForKey := cmgr.outboundConnIPGroups[groupKey]
//cmgr.mtxOutboundConnIPGroups.Unlock()
//
//if numGroupsForKey != 0 && numGroupsForKey != 1 {
// glog.V(2).Infof("IsFromRedundantOutboundIPAddress: Found numGroupsForKey != (0 or 1). Is (%d) "+
// "instead for addr (%s) and group key (%s). This "+
// "should never happen.", numGroupsForKey, na.IP.String(), groupKey)
//}
//
//if numGroupsForKey == 0 {
// return false
//}
//return true
}

func (cmgr *ConnectionManager) AddToGroupKey(na *wire.NetAddress) {
4 changes: 2 additions & 2 deletions lib/constants.go
@@ -1110,7 +1110,7 @@ var DeSoMainnetParams = DeSoParams{
MinChainWorkHex: "000000000000000000000000000000000000000000000000006314f9a85a949b",

MaxTipAgePoW: 24 * time.Hour,
MaxTipAgePoS: time.Hour,
MaxTipAgePoS: 24 * time.Hour,
Review comment (Member):
Ok so I have a question: If we're syncing from scratch, do we use MaxTipAgePoS to mark when we start checking block signatures? Because that would make sync take a lot longer unnecessarily. If this is NOT the case, then I like 24h as the time. If it IS the case, then maybe just do 3h or something more conservative until we can break that signature checking thing into its own flag...

Reply (Member Author):
I'll lower to 3 hours - this does make us start checking signatures sooner.
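For context on the parameter being discussed: a max tip age is typically used to decide whether a node considers itself caught up, i.e. the chain tip's timestamp must be within MaxTipAge of the local clock. The sketch below illustrates that general pattern only; it is not DeSo's actual syncing logic, which, as the thread above notes, also influences when block signatures start being checked.

```go
package main

import (
	"fmt"
	"time"
)

// isTipCurrent reports whether the tip block is recent enough to treat the
// node as caught up. With a 24h max tip age, a tip older than a day keeps
// the node in "syncing" mode.
func isTipCurrent(tipTimestamp time.Time, maxTipAge time.Duration) bool {
	return time.Since(tipTimestamp) <= maxTipAge
}

func main() {
	staleTip := time.Now().Add(-36 * time.Hour)
	fmt.Println(isTipCurrent(staleTip, 24*time.Hour))   // false: still syncing
	fmt.Println(isTipCurrent(time.Now(), 24*time.Hour)) // true: caught up
}
```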


// ===================================================================================
// Mainnet Bitcoin config
@@ -1544,7 +1544,7 @@ var DeSoTestnetParams = DeSoParams{
// TODO: Set to one day when we launch the testnet. In the meantime this value
// is more useful for local testing.
MaxTipAgePoW: time.Hour * 24,
MaxTipAgePoS: time.Hour,
MaxTipAgePoS: time.Hour * 24,

// Difficulty can't decrease to below 50% of its previous value or increase
// to above 200% of its previous value.
11 changes: 10 additions & 1 deletion lib/network_manager.go
@@ -209,9 +209,11 @@ func (nm *NetworkManager) startNonValidatorConnector() {
nm.exitGroup.Done()
return
case <-time.After(nm.peerConnectionRefreshInterval):
glog.V(2).Infof("NetworkManager.startNonValidatorConnector: Refreshing NON validator indices...")
Review comment (Member):
Sorry, delete this and the one below it as well. Don't want to have any logging in this select.

nm.refreshNonValidatorOutboundIndex()
nm.refreshNonValidatorInboundIndex()
nm.connectNonValidators()
glog.V(2).Infof("NetworkManager.startNonValidatorConnector: Finished refreshing NON validator indices...")
}
}
}
@@ -227,7 +229,9 @@ func (nm *NetworkManager) startRemoteNodeCleanup() {
nm.exitGroup.Done()
return
case <-time.After(nm.peerConnectionRefreshInterval):
glog.V(2).Infof("NetworkManager.startRemoteNodeCleanup: Starting remote node cleanup...")
nm.Cleanup()
glog.V(2).Infof("NetworkManager.startRemoteNodeCleanup: Finished remote node cleanup...")
}
}

@@ -976,7 +980,8 @@ func (nm *NetworkManager) Disconnect(rn *RemoteNode, disconnectReason string) {
if rn == nil {
return
}
glog.V(2).Infof("NetworkManager.Disconnect: Disconnecting from remote node id=%v", rn.GetId())
glog.V(2).Infof("NetworkManager.Disconnect: Disconnecting from remote "+
"node id=%v for reason %v", rn.GetId(), disconnectReason)
rn.Disconnect(disconnectReason)
nm.removeRemoteNodeFromIndexer(rn)
}
@@ -1301,6 +1306,10 @@ func (nm *NetworkManager) handleHandshakeCompletePoSMessage(remoteNode *RemoteNo
existingValidator, ok := nm.GetValidatorOutboundIndex().Get(validatorPk.Serialize())
if ok && remoteNode.GetId() != existingValidator.GetId() {
if remoteNode.IsPersistent() && !existingValidator.IsPersistent() {
glog.Errorf("NetworkManager.handleHandshakeCompletePoSMessage: Outbound RemoteNode with duplicate validator public key. "+
"Existing validator id: %v, new validator id: %v, ip old: %v, ip new: %v",
existingValidator.GetId().ToUint64(), remoteNode.GetId().ToUint64(),
existingValidator.GetNetAddress(), remoteNode.GetNetAddress())
nm.Disconnect(existingValidator, "outbound - duplicate validator public key")
return nil
}
8 changes: 7 additions & 1 deletion lib/pos_consensus.go
@@ -528,6 +528,11 @@ func (fc *FastHotStuffConsensus) HandleLocalTimeoutEvent(event *consensus.FastHo
// Broadcast the block to the validator network
validators := fc.networkManager.GetConnectedValidators()
for _, validator := range validators {
glog.V(2).Infof("FastHotStuffConsensus.HandleLocalTimeoutEvent: Broadcasting "+
"timeout msg %v to validator ID=%v pubkey=%v addr=%v",
timeoutMsg.ToString(),
validator.GetId(),
validator.validatorPublicKey.ToString(), validator.GetNetAddress())
sendMessageToRemoteNodeAsync(validator, timeoutMsg)
}

@@ -537,7 +542,8 @@
// HandleValidatorTimeout is called when we receive a validator timeout message from a peer. This function
// processes the timeout locally in the FastHotStuffEventLoop.
func (fc *FastHotStuffConsensus) HandleValidatorTimeout(pp *Peer, msg *MsgDeSoValidatorTimeout) ([]*BlockHash, error) {
glog.V(2).Infof("FastHotStuffConsensus.HandleValidatorTimeout: %s", msg.ToString())
glog.V(2).Infof("FastHotStuffConsensus.HandleValidatorTimeout: %s [%v %v]", msg.ToString(),
msg.VotingPublicKey.ToString(), msg.TimedOutView)
glog.V(2).Infof("FastHotStuffConsensus.HandleValidatorTimeout: %s", fc.fastHotStuffEventLoop.ToString())

// Hold a write lock on the consensus, since we need to update the timeout message in the
Expand Down