From b2ce6a260b470de417675b9496b169058bc3e232 Mon Sep 17 00:00:00 2001 From: Tolya Korniltsev Date: Thu, 29 Aug 2024 13:16:08 +0200 Subject: [PATCH] fix(metastore): local raft server id (#3530) {Suffrage:Voter ID:pyroscope-metastore-2.pyroscope-metastore-headless.pyroscope-test.svc.cluster.local. Address:pyroscope-metastore-2.pyroscope-metastore-headless.pyroscope-test.svc.cluster.local.:9099} {Suffrage:Voter ID:pyroscope-metastore-2.pyroscope-metastore-headless.pyroscope-test.svc.cluster.local.:9099 Address:pyroscope-metastore-2.pyroscope-metastore-headless.pyroscope-test.svc.cluster.local.:9099}]" - fix bootstrap of metastore with correct server id - add bootstrap retries --- pkg/experiment/metastore/metastore.go | 5 +++ .../metastore/metastore_bootstrap.go | 41 ++++++++++++++++--- .../values-micro-services-experiment.yaml | 2 +- 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/pkg/experiment/metastore/metastore.go b/pkg/experiment/metastore/metastore.go index 81db5a9687..4665ab8c80 100644 --- a/pkg/experiment/metastore/metastore.go +++ b/pkg/experiment/metastore/metastore.go @@ -13,6 +13,7 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" + "github.com/grafana/dskit/dns" "github.com/grafana/dskit/flagext" "github.com/grafana/dskit/grpcclient" "github.com/grafana/dskit/services" @@ -130,6 +131,8 @@ type Metastore struct { metrics *metastoreMetrics client *metastoreclient.Client readySince time.Time + + dnsProvider *dns.Provider } type Limits interface{} @@ -226,6 +229,8 @@ func (m *Metastore) initRaft() (err error) { if err = m.bootstrap(); err != nil { return fmt.Errorf("failed to bootstrap cluster: %w", err) } + } else { + _ = level.Info(m.logger).Log("msg", "restoring existing state, not bootstraping") } m.leaderhealth.Register(m.raft, metastoreRaftLeaderHealthServiceName) diff --git a/pkg/experiment/metastore/metastore_bootstrap.go b/pkg/experiment/metastore/metastore_bootstrap.go index 00db8fe508..3d8cba9793 100644 --- a/pkg/experiment/metastore/metastore_bootstrap.go +++ b/pkg/experiment/metastore/metastore_bootstrap.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "github.com/grafana/dskit/backoff" "net" "slices" "strings" @@ -16,7 +17,7 @@ import ( ) func (m *Metastore) bootstrap() error { - peers, err := m.bootstrapPeers() + peers, err := m.bootstrapPeersWithRetries() if err != nil { return fmt.Errorf("failed to resolve peers: %w", err) } @@ -66,11 +67,13 @@ func (m *Metastore) bootstrapPeers() ([]raft.Server, error) { if len(resolve) > 0 { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() - prov := dns.NewProvider(m.logger, m.reg, dns.MiekgdnsResolverType) - if err := prov.Resolve(ctx, resolve); err != nil { + if m.dnsProvider == nil { + m.dnsProvider = dns.NewProvider(m.logger, m.reg, dns.MiekgdnsResolverType) + } + if err := m.dnsProvider.Resolve(ctx, resolve); err != nil { return nil, fmt.Errorf("failed to resolve bootstrap peers: %w", err) } - resolvedPeers := prov.Addresses() + resolvedPeers := m.dnsProvider.Addresses() if len(resolvedPeers) == 0 { // The local node is the only one in the cluster, but peers // were supposed to be present. Stop here to avoid bootstrapping @@ -95,8 +98,8 @@ func (m *Metastore) bootstrapPeers() ([]raft.Server, error) { return a.ID == b.ID }) if len(peers) != m.config.Raft.BootstrapExpectPeers { - return nil, fmt.Errorf("expected number of bootstrap peers not reached: got %d, expected %d", - len(peers), m.config.Raft.BootstrapExpectPeers) + return nil, fmt.Errorf("expected number of bootstrap peers not reached: got %d, expected %d\n%+v", + len(peers), m.config.Raft.BootstrapExpectPeers, peers) } return peers, nil } @@ -127,3 +130,29 @@ func parsePeer(raw string) raft.Server { Address: raft.ServerAddress(addr), } } + +func (m *Metastore) bootstrapPeersWithRetries() (peers []raft.Server, err error) { + attempt := func() bool { + peers, err = m.bootstrapPeers() + level.Debug(m.logger).Log("msg", "resolving bootstrap peers", "peers", fmt.Sprint(peers), "err", err) + if err != nil { + _ = level.Error(m.logger).Log("msg", "failed to resolve bootstrap peers", "err", err) + return false + } + return true + } + backoffConfig := backoff.Config{ + MinBackoff: 1 * time.Second, + MaxBackoff: 10 * time.Second, + MaxRetries: 20, + } + backoff := backoff.New(context.Background(), backoffConfig) + for backoff.Ongoing() { + if !attempt() { + backoff.Wait() + } else { + return peers, nil + } + } + return nil, fmt.Errorf("failed to resolve bootstrap peers after %d retries %w", backoff.NumRetries(), err) +} diff --git a/tools/dev/experiment/values-micro-services-experiment.yaml b/tools/dev/experiment/values-micro-services-experiment.yaml index f7782d25fc..6c4649886f 100644 --- a/tools/dev/experiment/values-micro-services-experiment.yaml +++ b/tools/dev/experiment/values-micro-services-experiment.yaml @@ -5,7 +5,7 @@ pyroscope: query-backend.address: "dns:///_grpc._tcp.pyroscope-query-worker-headless.$(NAMESPACE_FQDN):9095" metastore.address: "dns:///_grpc._tcp.pyroscope-metastore-headless.$(NAMESPACE_FQDN):9095" metastore.raft.bind-address: ":9099" - metastore.raft.server-id: "$(POD_NAME).pyroscope-metastore-headless.$(NAMESPACE_FQDN)" + metastore.raft.server-id: "$(POD_NAME).pyroscope-metastore-headless.$(NAMESPACE_FQDN):9099" metastore.raft.advertise-address: "$(POD_NAME).pyroscope-metastore-headless.$(NAMESPACE_FQDN):9099" metastore.raft.bootstrap-peers: "dnssrvnoa+_raft._tcp.pyroscope-metastore-headless.$(NAMESPACE_FQDN):9099" metastore.raft.bootstrap-expect-peers: "3"