Skip to content

Commit

Permalink
[Feature] Abort resignation of leadership when dbserver restarted (#1291
Browse files Browse the repository at this point in the history
)
  • Loading branch information
informalict authored Jul 20, 2023
1 parent 4f12875 commit 8949920
Show file tree
Hide file tree
Showing 13 changed files with 187 additions and 36 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
- (Feature) Run secured containers as a feature
- (Feature) Expose core.PodSecurityContext Sysctl options
- (Bugfix) Skip Collection check for missing Database
- (Feature) Abort resignation of leadership when DB server is restarted

## [1.2.31](https://github.com/arangodb/kube-arangodb/tree/1.2.31) (2023-07-14)
- (Improvement) Block traffic on the services if there is more than 1 active leader in ActiveFailover mode
Expand Down
8 changes: 7 additions & 1 deletion pkg/apis/deployment/v1/plan_locals.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//
// DISCLAIMER
//
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
// Copyright 2016-2023 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand All @@ -20,12 +20,18 @@

package v1

import "fmt"

// PlanLocalKey is the string-based key type used to index PlanLocals.
type PlanLocalKey string

// String converts the key to its underlying string value.
func (p PlanLocalKey) String() string {
	raw := string(p)
	return raw
}

// Register formats a value with fmt.Sprintf(format, args...) and adds it to
// the given action as a parameter named after this key. It returns whatever
// the action's AddParam call returns.
func (p PlanLocalKey) Register(action Action, format string, args ...interface{}) Action {
	name := p.String()
	value := fmt.Sprintf(format, args...)

	return action.AddParam(name, value)
}

type PlanLocals map[PlanLocalKey]string

func (p *PlanLocals) Remove(key PlanLocalKey) bool {
Expand Down
8 changes: 7 additions & 1 deletion pkg/apis/deployment/v2alpha1/plan_locals.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//
// DISCLAIMER
//
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
// Copyright 2016-2023 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand All @@ -20,12 +20,18 @@

package v2alpha1

import "fmt"

// PlanLocalKey is the string-based key type used to index PlanLocals.
type PlanLocalKey string

// String converts the key to its underlying string value.
func (p PlanLocalKey) String() string {
	raw := string(p)
	return raw
}

// Register formats a value with fmt.Sprintf(format, args...) and adds it to
// the given action as a parameter named after this key. It returns whatever
// the action's AddParam call returns.
func (p PlanLocalKey) Register(action Action, format string, args ...interface{}) Action {
	name := p.String()
	value := fmt.Sprintf(format, args...)

	return action.AddParam(name, value)
}

type PlanLocals map[PlanLocalKey]string

func (p *PlanLocals) Remove(key PlanLocalKey) bool {
Expand Down
6 changes: 2 additions & 4 deletions pkg/deployment/agency/definitions.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ const (
TargetKey = "Target"

CurrentMaintenanceServers = "MaintenanceServers"
CurrentServersKnown = "ServersKnown"

TargetHotBackupKey = "HotBackup"

Expand Down Expand Up @@ -64,10 +65,6 @@ func GetAgencyKey(parts ...string) string {
return fmt.Sprintf("/%s", strings.Join(parts, "/"))
}

// GetAgencyReadKey returns the given elements unchanged, as a single slice.
// Calling it without arguments yields a nil slice.
func GetAgencyReadKey(elements ...string) []string {
	key := elements
	return key
}

// GetAgencyReadRequest wraps the given key slices into a ReadRequest without
// modifying them.
// NOTE(review): ReadRequest is declared outside this view; it must be
// convertible from [][]string for this to compile - confirm.
func GetAgencyReadRequest(elements ...[]string) ReadRequest {
	return ReadRequest(elements)
}
Expand All @@ -78,6 +75,7 @@ func GetAgencyReadRequestFields() ReadRequest {
GetAgencyKey(ArangoKey, PlanKey, PlanCollectionsKey),
GetAgencyKey(ArangoKey, PlanKey, PlanDatabasesKey),
GetAgencyKey(ArangoKey, CurrentKey, PlanCollectionsKey),
GetAgencyKey(ArangoKey, CurrentKey, CurrentServersKnown),
GetAgencyKey(ArangoKey, CurrentKey, CurrentMaintenanceServers),
GetAgencyKey(ArangoKey, TargetKey, TargetHotBackupKey),
GetAgencyKey(ArangoKey, TargetKey, TargetJobToDoKey),
Expand Down
21 changes: 21 additions & 0 deletions pkg/deployment/agency/state/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@

package state

import "github.com/arangodb/go-driver"

type Root struct {
Arango State `json:"arango"`
ArangoDB DB `json:"arangodb,omitempty"`
Expand All @@ -40,9 +42,18 @@ type State struct {
Target Target `json:"Target"`
}

// ServerKnown stores information about a single ArangoDB server.
type ServerKnown struct {
// RebootID is an incrementing value which describes how many times the server was restarted.
// It is decoded from the "rebootId" JSON field.
RebootID int `json:"rebootId"`
}

// Current holds the values decoded from the "MaintenanceServers", "Collections"
// and "ServersKnown" JSON keys.
type Current struct {
MaintenanceServers CurrentMaintenanceServers `json:"MaintenanceServers,omitempty"`
Collections CurrentCollections `json:"Collections"`

// ServersKnown stores information about ArangoDB servers, keyed by server ID.
ServersKnown map[driver.ServerID]ServerKnown `json:"ServersKnown,omitempty"`
}

type Plan struct {
Expand Down Expand Up @@ -371,3 +382,13 @@ func (s State) GetCollectionDatabaseByID(id string) (string, bool) {

return "", false
}

// GetRebootID looks up the reboot ID recorded for the given server ID.
// The boolean result is false, and the reboot ID is 0, when the server ID
// is not present in Current.ServersKnown.
func (s State) GetRebootID(id driver.ServerID) (int, bool) {
	known, found := s.Current.ServersKnown[id]
	if !found {
		return 0, false
	}

	return known.RebootID, true
}
17 changes: 17 additions & 0 deletions pkg/deployment/agency/state/state_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -373,3 +373,20 @@ func Test_MissingDatabaseCase(t *testing.T) {

require.Len(t, GetDBServerBlockingRestartShards(s, "PRMR-1e4bxazq"), 0)
}

// Test_GetRebootID checks GetRebootID against the agencyDump39 fixture for a
// server ID that is expected to be found and for one that is not.
func Test_GetRebootID(t *testing.T) {
	var dump DumpState
	err := json.Unmarshal(agencyDump39, &dump)
	require.NoError(t, err)

	// The full server ID is expected to resolve to reboot ID 1.
	t.Run("Existing", func(t *testing.T) {
		rebootID, found := dump.Agency.Arango.GetRebootID("PRMR-n92yizyp")
		require.True(t, found)
		require.Equal(t, 1, rebootID)
	})

	// A truncated server ID must not match and yields the zero value.
	t.Run("Missing", func(t *testing.T) {
		rebootID, found := dump.Agency.Arango.GetRebootID("PRMR-n92yiz")
		require.False(t, found)
		require.Equal(t, 0, rebootID)
	})
}
47 changes: 41 additions & 6 deletions pkg/deployment/reconcile/action_resign_leadership.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ package reconcile

import (
"context"
"strconv"

"github.com/arangodb/go-driver"

Expand All @@ -31,6 +32,10 @@ import (
"github.com/arangodb/kube-arangodb/pkg/util/globals"
)

const (
// actionResignLeadershipRebootID is the plan-local key under which the
// reboot ID is looked up in the action's locals by isServerRebooted.
actionResignLeadershipRebootID api.PlanLocalKey = "rebootID"
)

// newResignLeadershipAction creates a new Action that implements the given
// planned ResignLeadership action.
func newResignLeadershipAction(action api.Action, actionCtx ActionContext) Action {
Expand Down Expand Up @@ -63,14 +68,14 @@ func (a *actionResignLeadership) Start(ctx context.Context) (bool, error) {
client, err := a.actionCtx.GetMembersState().State().GetDatabaseClient()
if err != nil {
a.log.Err(err).Error("Unable to get client")
return true, errors.WithStack(err)
return false, errors.WithStack(err)
}

switch group {
case api.ServerGroupDBServers:
if agencyState, agencyOK := a.actionCtx.GetAgencyCache(); !agencyOK {
a.log.Err(err).Warn("Maintenance is enabled, skipping action")
return true, errors.WithStack(err)
a.log.Warn("AgencyCache is not ready")
return false, nil
} else if agencyState.Supervision.Maintenance.Exists() {
// We are done, action cannot be handled on maintenance mode
a.log.Warn("Maintenance is enabled, skipping action")
Expand All @@ -82,7 +87,7 @@ func (a *actionResignLeadership) Start(ctx context.Context) (bool, error) {
cluster, err := client.Cluster(ctxChild)
if err != nil {
a.log.Err(err).Error("Unable to get cluster client")
return true, errors.WithStack(err)
return false, errors.WithStack(err)
}

var jobID string
Expand All @@ -92,13 +97,13 @@ func (a *actionResignLeadership) Start(ctx context.Context) (bool, error) {
a.log.Debug("Temporary shutdown, resign leadership")
if err := cluster.ResignServer(jobCtx, m.ID); err != nil {
a.log.Err(err).Debug("Failed to resign server")
return true, errors.WithStack(err)
return false, errors.WithStack(err)
}

m.CleanoutJobID = jobID

if err := a.actionCtx.UpdateMember(ctx, m); err != nil {
return true, errors.WithStack(err)
return false, errors.WithStack(err)
}

return false, nil
Expand Down Expand Up @@ -127,6 +132,8 @@ func (a *actionResignLeadership) CheckProgress(ctx context.Context) (bool, bool,
return false, false, errors.WithStack(err)
}
return true, false, nil
} else if a.isServerRebooted(agencyState, driver.ServerID(m.ID)) {
return true, false, nil
}

_, jobStatus := agencyState.Target.GetJob(state.JobID(m.CleanoutJobID))
Expand All @@ -150,3 +157,31 @@ func (a *actionResignLeadership) CheckProgress(ctx context.Context) (bool, bool,
}
return false, false, nil
}

// isServerRebooted reports whether the given server was rebooted during
// resignation of leadership.
//
// It compares the reboot ID found in agencyState for serverID with the reboot
// ID stored in the action's locals under actionResignLeadershipRebootID.
// It returns true only when the agency value is strictly greater than the
// stored one. It returns false when the server is unknown to the agency state,
// when the stored value is missing, or when the stored value is not a number.
func (a *actionResignLeadership) isServerRebooted(agencyState state.State, serverID driver.ServerID) bool {
	rebootID, ok := agencyState.GetRebootID(serverID)
	if !ok {
		return false
	}

	v, ok := a.actionCtx.Get(a.action, actionResignLeadershipRebootID)
	if !ok {
		// The logger is printf-style: an argument without a matching verb
		// would only append a "%!(EXTRA ...)" marker to the message.
		a.log.Warn("missing reboot ID in action's locals")
		return false
	}

	r, err := strconv.Atoi(v)
	if err != nil {
		a.log.Err(err).Warn("reboot ID '%s' supposed to be a number", v)
		return false
	}

	if rebootID <= r {
		// Server has not been restarted.
		return false
	}

	a.log.Warn("resign leadership aborted because rebootID has changed from %d to %d", r, rebootID)
	return true
}
11 changes: 8 additions & 3 deletions pkg/deployment/reconcile/helper_wrap.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//
// DISCLAIMER
//
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
// Copyright 2016-2023 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -59,12 +59,17 @@ func withMemberMaintenance(group api.ServerGroup, member api.MemberStatus, reaso
actions.NewAction(api.ActionTypeDisableMemberMaintenance, group, member, reason))
}

func withResignLeadership(group api.ServerGroup, member api.MemberStatus, reason string, plan api.Plan) api.Plan {
func withResignLeadership(group api.ServerGroup, member api.MemberStatus, reason string, plan api.Plan, rebootID *int) api.Plan {
if member.Image == nil {
return plan
}

return api.AsPlan(plan).Before(actions.NewAction(api.ActionTypeResignLeadership, group, member, reason))
action := actions.NewAction(api.ActionTypeResignLeadership, group, member, reason)
if rebootID != nil {
action = actionResignLeadershipRebootID.Register(action, "%d", *rebootID)
}

return api.AsPlan(plan).Before(action)
}

func cleanOutMember(group api.ServerGroup, m api.MemberStatus) api.Plan {
Expand Down
Loading

0 comments on commit 8949920

Please sign in to comment.