Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

hotfix(el/cl): allow multi clients to start if at least one node is up #2000

Merged
merged 44 commits into from
Jan 30, 2025
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
4a7ed06
do not crash if one client fails version check
y0sher Jan 22, 2025
1784907
fix help note on multiple addresses
y0sher Jan 22, 2025
6139748
don't compare genensis values
y0sher Jan 22, 2025
54f982f
remove old assertSameGenesis code
y0sher Jan 22, 2025
7a4248d
beacon/goclient: set up connection hooks, assert genesis
nkryuchkov Jan 22, 2025
1025641
beacon/goclient: remove outdated comment
nkryuchkov Jan 23, 2025
af5c455
eth/executionclient: allow starting with unhealthy client
nkryuchkov Jan 23, 2025
3b9efcf
eth/executionclient: simplify mutex usage
nkryuchkov Jan 23, 2025
c6592ad
eth/executionclient: fix tests for assertSameChainID
nkryuchkov Jan 23, 2025
b21c110
eth/executionclient: improve comment for assertSameGenesis
nkryuchkov Jan 23, 2025
902c99a
handle nil genesis and chain ID responses
nkryuchkov Jan 23, 2025
13edb6a
Merge branch 'stage' into fix/multiclient-oneclient
nkryuchkov Jan 24, 2025
e1cc2e3
refactor(multi_client): replace connectedCount atomic int with bool. …
y0sher Jan 26, 2025
2bdc1ac
clarify comment
nkryuchkov Jan 27, 2025
8db421f
fix double mutex lock
nkryuchkov Jan 27, 2025
0dd1027
fix potential nil pointer dereference
nkryuchkov Jan 27, 2025
8452bf4
check only genesis fork version instead of whole genesis
nkryuchkov Jan 27, 2025
d5fed41
create getClient helper method
nkryuchkov Jan 27, 2025
75e2753
improve logs
nkryuchkov Jan 27, 2025
999608b
fix issue with log fields
nkryuchkov Jan 27, 2025
66d5051
atomic chain ID
nkryuchkov Jan 27, 2025
61874f7
fix comment for client mutexes
nkryuchkov Jan 27, 2025
70594e8
no nil assignment in Close
nkryuchkov Jan 27, 2025
3cc71cd
add panic hook
nkryuchkov Jan 27, 2025
7dd5aa0
attempt to fix panic
nkryuchkov Jan 27, 2025
ab91668
fix logging
nkryuchkov Jan 27, 2025
99405c9
iterate clients forever in StreamLogs
nkryuchkov Jan 27, 2025
768665d
set follow distance to 1
nkryuchkov Jan 27, 2025
6af3463
Revert "set follow distance to 1"
nkryuchkov Jan 27, 2025
e9f5c7b
delete healthy channel
nkryuchkov Jan 27, 2025
f093561
remove obsolete tests
nkryuchkov Jan 28, 2025
7b3a1f5
add successful call log
nkryuchkov Jan 28, 2025
16dc397
fix potential nil ptr dereference
nkryuchkov Jan 28, 2025
95af801
go-eth2-client: use fork with sync distance tolerance
nkryuchkov Jan 28, 2025
411f482
add a comment about using github.com/nkryuchkov/go-eth2-client
nkryuchkov Jan 28, 2025
cbd61f5
code review comments
nkryuchkov Jan 28, 2025
f8188c6
named consensus client logger
nkryuchkov Jan 29, 2025
5b72d6a
remove trace logs
nkryuchkov Jan 29, 2025
77ca3cf
fix a typo
nkryuchkov Jan 29, 2025
0406ad3
add a comment with execution client shutdown scenarios
nkryuchkov Jan 30, 2025
23d393d
improve the comment for the call method
nkryuchkov Jan 30, 2025
85779ec
Merge branch 'stage' into fix/multiclient-oneclient
nkryuchkov Jan 30, 2025
c890973
log client address on submissions
nkryuchkov Jan 30, 2025
4751783
improve logs
nkryuchkov Jan 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
203 changes: 121 additions & 82 deletions beacon/goclient/goclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@ type GoClient struct {
clients []Client
multiClient MultiClient

genesisMu sync.Mutex
genesis *apiv1.Genesis

syncDistanceTolerance phase0.Slot
nodeSyncingFn func(ctx context.Context, opts *api.NodeSyncingOpts) (*api.Response[*apiv1.SyncState], error)

Expand Down Expand Up @@ -154,82 +157,41 @@ func New(
longTimeout = DefaultLongTimeout
}

client := &GoClient{
log: logger,
ctx: opt.Context,
network: opt.Network,
syncDistanceTolerance: phase0.Slot(opt.SyncDistanceTolerance),
operatorDataStore: operatorDataStore,
registrationCache: map[phase0.BLSPubKey]*api.VersionedSignedValidatorRegistration{},
attestationDataCache: ttlcache.New(
// we only fetch attestation data during the slot of the relevant duty (and never later),
// hence caching it for 2 slots is sufficient
ttlcache.WithTTL[phase0.Slot, *phase0.AttestationData](2 * opt.Network.SlotDurationSec()),
),
commonTimeout: commonTimeout,
longTimeout: longTimeout,
}

beaconAddrList := strings.Split(opt.BeaconNodeAddr, ";") // TODO: Decide what symbol to use as a separator. Bootnodes are currently separated by ";". Deployment bot currently uses ",".
if len(beaconAddrList) == 0 {
return nil, fmt.Errorf("no beacon node address provided")
}

var consensusClients []Client
var consensusClientsAsServices []eth2client.Service
for _, beaconAddr := range beaconAddrList {
httpClient, err := setupHTTPClient(opt.Context, logger, beaconAddr, commonTimeout)
if err != nil {
if err := client.addSingleClient(opt.Context, beaconAddr); err != nil {
return nil, err
}

nodeVersionResp, err := httpClient.NodeVersion(opt.Context, &api.NodeVersionOpts{})
if err != nil {
logger.Error(clResponseErrMsg,
zap.String("api", "NodeVersion"),
zap.Error(err),
)
return nil, fmt.Errorf("failed to get node version: %w", err)
}
if nodeVersionResp == nil {
logger.Error(clNilResponseErrMsg,
zap.String("api", "NodeVersion"),
)
return nil, fmt.Errorf("node version response is nil")
}

logger.Info("consensus client connected",
fields.Name(httpClient.Name()),
fields.Address(httpClient.Address()),
zap.String("client", string(ParseNodeClient(nodeVersionResp.Data))),
zap.String("version", nodeVersionResp.Data),
)

consensusClients = append(consensusClients, httpClient)
consensusClientsAsServices = append(consensusClientsAsServices, httpClient)
}

err := assertSameGenesis(opt.Context, consensusClients...)
if err != nil {
return nil, fmt.Errorf("assert same spec: %w", err)
}

multiClient, err := eth2clientmulti.New(
opt.Context,
eth2clientmulti.WithClients(consensusClientsAsServices),
eth2clientmulti.WithLogLevel(zerolog.DebugLevel),
eth2clientmulti.WithTimeout(commonTimeout),
)
err := client.initMultiClient(opt.Context)
if err != nil {
logger.Error("Consensus multi client initialization failed",
zap.String("address", opt.BeaconNodeAddr),
zap.Error(err),
)

return nil, fmt.Errorf("create multi client: %w", err)
}
consensusClient := multiClient.(*eth2clientmulti.Service)

client := &GoClient{
log: logger,
ctx: opt.Context,
network: opt.Network,
clients: consensusClients,
multiClient: consensusClient,
syncDistanceTolerance: phase0.Slot(opt.SyncDistanceTolerance),
operatorDataStore: operatorDataStore,
registrationCache: map[phase0.BLSPubKey]*api.VersionedSignedValidatorRegistration{},
attestationDataCache: ttlcache.New(
// we only fetch attestation data during the slot of the relevant duty (and never later),
// hence caching it for 2 slots is sufficient
ttlcache.WithTTL[phase0.Slot, *phase0.AttestationData](2 * opt.Network.SlotDurationSec()),
),
commonTimeout: commonTimeout,
longTimeout: longTimeout,
return nil, err
}

client.nodeSyncingFn = client.nodeSyncing
Expand All @@ -241,55 +203,132 @@ func New(
return client, nil
}

func setupHTTPClient(ctx context.Context, logger *zap.Logger, addr string, commonTimeout time.Duration) (*eth2clienthttp.Service, error) {
func (gc *GoClient) initMultiClient(ctx context.Context) error {
var services []eth2client.Service
for _, client := range gc.clients {
services = append(services, client)
}

multiClient, err := eth2clientmulti.New(
ctx,
eth2clientmulti.WithClients(services),
eth2clientmulti.WithLogLevel(zerolog.DebugLevel),
eth2clientmulti.WithTimeout(gc.commonTimeout),
)
if err != nil {
return fmt.Errorf("create multi client: %w", err)
}

gc.multiClient = multiClient.(*eth2clientmulti.Service)
return nil
}

func (gc *GoClient) addSingleClient(ctx context.Context, addr string) error {
httpClient, err := eth2clienthttp.New(
ctx,
// WithAddress supplies the address of the beacon node, in host:port format.
eth2clienthttp.WithAddress(addr),
// LogLevel supplies the level of logging to carry out.
eth2clienthttp.WithLogLevel(zerolog.DebugLevel),
eth2clienthttp.WithTimeout(commonTimeout),
eth2clienthttp.WithTimeout(gc.commonTimeout),
eth2clienthttp.WithReducedMemoryUsage(true),
eth2clienthttp.WithAllowDelayedStart(true),
eth2clienthttp.WithHooks(gc.singleClientHooks()),
)
if err != nil {
logger.Error("Consensus http client initialization failed",
gc.log.Error("Consensus http client initialization failed",
zap.String("address", addr),
zap.Error(err),
)

return nil, fmt.Errorf("create http client: %w", err)
return fmt.Errorf("create http client: %w", err)
Comment on lines -256 to +245
Copy link
Contributor

@iurii-ssv iurii-ssv Jan 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not really specific to this PR but we often (and here as well) do 2 "duplicate" things

  • log error
  • return that same error (that's always gonna be logged eventually by the caller resulting in roughly duplicate log-line)

maybe would be simpler to just return error (with formatted with fmt.Errorf to provide the necessary context) in places like this, bringing it up so we can get on the same page (whether we want to keep an eye on things like this or not)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main difference and issue is that when logging with zap and adding fields its easy to search them by the label value. we'll need to squeeze everything to the fmt.Errorf, it'll still be searchable, but not by label.

Copy link
Contributor

@nkryuchkov nkryuchkov Jan 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that we could think more about our logging approach, but I think this package currently uses logging in a way similar to other packages. If we decide to improve logging (e.g. use custom error types with fields), we need to do it project-wide

}

return httpClient.(*eth2clienthttp.Service), nil
gc.clients = append(gc.clients, httpClient.(*eth2clienthttp.Service))

return nil
}

// assertSameGenesis should receive a non-empty list
func assertSameGenesis(ctx context.Context, services ...Client) error {
firstGenesis, err := services[0].Genesis(ctx, &api.GenesisOpts{})
if err != nil {
return fmt.Errorf("get first genesis: %w", err)
func (gc *GoClient) singleClientHooks() *eth2clienthttp.Hooks {
return &eth2clienthttp.Hooks{
OnActive: func(ctx context.Context, s *eth2clienthttp.Service) {
// If err is nil, nodeVersionResp is never nil.
iurii-ssv marked this conversation as resolved.
Show resolved Hide resolved
nodeVersionResp, err := s.NodeVersion(ctx, &api.NodeVersionOpts{})
if err != nil {
gc.log.Error(clResponseErrMsg,
zap.String("address", s.Address()),
zap.String("api", "NodeVersion"),
zap.Error(err),
)
return
}

gc.log.Info("consensus client connected",
fields.Name(s.Name()),
fields.Address(s.Address()),
zap.String("client", string(ParseNodeClient(nodeVersionResp.Data))),
zap.String("version", nodeVersionResp.Data),
)

genesis, err := s.Genesis(ctx, &api.GenesisOpts{})
if err != nil {
gc.log.Error(clResponseErrMsg,
zap.String("address", s.Address()),
zap.String("api", "Genesis"),
zap.Error(err),
)
return
}

if err := gc.assertSameGenesis(genesis.Data); err != nil {
gc.genesisMu.Lock()
defer gc.genesisMu.Unlock()
gc.log.Fatal("client genesis differs",
zap.String("address", s.Address()),
zap.Any("client_genesis", genesis.Data),
zap.Any("expected_genesis", gc.genesis),
)
return // Tests may override Fatal's behavior
}
},
OnInactive: func(ctx context.Context, s *eth2clienthttp.Service) {
gc.log.Warn("consensus client disconnected",
fields.Name(s.Name()),
fields.Address(s.Address()),
)
},
OnSynced: func(ctx context.Context, s *eth2clienthttp.Service) {
gc.log.Info("consensus client synced",
fields.Name(s.Name()),
fields.Address(s.Address()),
)
},
OnDesynced: func(ctx context.Context, s *eth2clienthttp.Service) {
gc.log.Warn("consensus client desynced",
fields.Name(s.Name()),
fields.Address(s.Address()),
)
},
}
}

for _, service := range services[1:] {
srvGenesis, err := service.Genesis(ctx, &api.GenesisOpts{})
if err != nil {
return fmt.Errorf("get service genesis: %w", err)
}
func (gc *GoClient) assertSameGenesis(genesis *apiv1.Genesis) error {
gc.genesisMu.Lock()
defer gc.genesisMu.Unlock()

if err := sameGenesis(firstGenesis.Data, srvGenesis.Data); err != nil {
return fmt.Errorf("different genesis: %w", err)
}
if gc.genesis == nil {
gc.genesis = genesis
return nil
}

if err := sameGenesis(gc.genesis, genesis); err != nil {
return fmt.Errorf("different genesis: %w", err)
}

return nil
}

func sameGenesis(a, b *apiv1.Genesis) error {
if a == nil || b == nil { // Input parameters should never be nil, so the check may fail if both are nil
return fmt.Errorf("genesis is nil")
}

if !a.GenesisTime.Equal(b.GenesisTime) {
return fmt.Errorf("genesis time mismatch, got %v and %v", a.GenesisTime, b.GenesisTime)
}
Expand Down
2 changes: 1 addition & 1 deletion beacon/goclient/goclient_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ func TestTimeouts(t *testing.T) {
return nil
})
_, err := mockClient(ctx, undialableServer.URL, commonTimeout, longTimeout)
require.ErrorContains(t, err, "context deadline exceeded")
require.ErrorContains(t, err, "client is not active")
}

// Too slow to respond to the Validators request.
Expand Down
2 changes: 1 addition & 1 deletion eth/executionclient/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import (

// ExecutionOptions contains config configurations related to Ethereum execution client.
type ExecutionOptions struct {
Addr string `yaml:"ETH1Addr" env:"ETH_1_ADDR" env-required:"true" env-description:"Execution client WebSocket address. Supports multiple comma-separated addresses"`
Addr string `yaml:"ETH1Addr" env:"ETH_1_ADDR" env-required:"true" env-description:"Execution client WebSocket address. Supports multiple semicolon separated addresses. ex: ws://localhost:8546;ws://localhost:8547"`
ConnectionTimeout time.Duration `yaml:"ETH1ConnectionTimeout" env:"ETH_1_CONNECTION_TIMEOUT" env-default:"10s" env-description:"Execution client connection timeout"`
SyncDistanceTolerance uint64 `yaml:"ETH1SyncDistanceTolerance" env:"ETH_1_SYNC_DISTANCE_TOLERANCE" env-default:"5" env-description:"The number of out-of-sync blocks we can tolerate"`
}
Loading
Loading