Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Retry reads when ES unavailable #883

Merged
merged 5 commits into from
Nov 30, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions internal/pkg/coordinator/monitor.go
Original file line number Diff line number Diff line change
@@ -7,6 +7,7 @@ package coordinator
import (
"context"
"errors"
"fmt"
"net"
"os"
"runtime"
@@ -132,15 +133,15 @@ func (m *monitorT) Run(ctx context.Context) (err error) {
case hits := <-s.Output():
err = m.handlePolicies(ctx, hits)
if err != nil {
return err
m.log.Warn().Err(err).Msgf("Encountered an error while policy leadership changes; continuing to retry.")
}
case <-mT.C:
m.calcMetadata()
mT.Reset(m.metadataInterval)
case <-lT.C:
err = m.ensureLeadership(ctx)
if err != nil {
return err
m.log.Warn().Err(err).Msgf("Encountered an error while checking/assigning policy leaders; continuing to retry.")
}
lT.Reset(m.checkInterval)
case <-ctx.Done():
@@ -157,6 +158,7 @@ func (m *monitorT) handlePolicies(ctx context.Context, hits []es.HitT) error {
var policy model.Policy
err := hit.Unmarshal(&policy)
if err != nil {
m.log.Debug().Err(err).Msg("Failed to deserialize policy json")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the reason you put this on Debug level? Do you expect this to happen often?

return err
}
if policy.CoordinatorIdx != 0 {
@@ -170,6 +172,7 @@ func (m *monitorT) handlePolicies(ctx context.Context, hits []es.HitT) error {
// current leader send to its coordinator
err = p.cord.Update(ctx, policy)
if err != nil {
m.log.Info().Err(err).Msg("Failed to update policy leader")
return err
}
}
@@ -192,8 +195,9 @@ func (m *monitorT) handlePolicies(ctx context.Context, hits []es.HitT) error {
func (m *monitorT) ensureLeadership(ctx context.Context) error {
m.log.Debug().Msg("ensuring leadership of policies")
err := dl.EnsureServer(ctx, m.bulker, m.version, m.agentMetadata, m.hostMetadata, dl.WithIndexName(m.serversIndex))

if err != nil {
return err
return fmt.Errorf("Failed to check server status on Elasticsearch (%s): %w", m.hostMetadata.Name, err)
}

// fetch current policies and leaders
@@ -204,7 +208,7 @@ func (m *monitorT) ensureLeadership(ctx context.Context) error {
m.log.Debug().Str("index", m.policiesIndex).Msg(es.ErrIndexNotFound.Error())
return nil
}
return err
return fmt.Errorf("Encountered error while querying policies: %w", err)
}
if len(policies) > 0 {
ids := make([]string, len(policies))
@@ -214,7 +218,7 @@ func (m *monitorT) ensureLeadership(ctx context.Context) error {
leaders, err = dl.SearchPolicyLeaders(ctx, m.bulker, ids, dl.WithIndexName(m.leadersIndex))
if err != nil {
if !errors.Is(err, es.ErrIndexNotFound) {
return err
return fmt.Errorf("Encountered error while fetching policy leaders: %w", err)
}
}
}
Expand Down