Skip to content

Commit

Permalink
Merge pull request #153 from arangodb/bugfix/cleanup-long-terminating…
Browse files Browse the repository at this point in the history
…-stateful-pods

Cleanup long terminating stateful pods
  • Loading branch information
ewoutp authored Jun 6, 2018
2 parents 4a2f34b + 12ae458 commit 265397d
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 1 deletion.
2 changes: 2 additions & 0 deletions pkg/apis/deployment/v1alpha/conditions.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ const (
// the deployment have changed. Once that is the case, the operator will no longer
// touch the deployment, until the original secrets have been restored.
ConditionTypeSecretsChanged ConditionType = "SecretsChanged"
// ConditionTypeMemberOfCluster indicates that the member is a known member of the ArangoDB cluster.
ConditionTypeMemberOfCluster ConditionType = "MemberOfCluster"
)

// Condition represents one current condition of a deployment or deployment member.
Expand Down
5 changes: 5 additions & 0 deletions pkg/apis/deployment/v1alpha/member_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ type MemberStatus struct {
IsInitialized bool `json:"initialized"`
}

// Age returns the duration since the creation timestamp of this member.
func (s MemberStatus) Age() time.Duration {
return time.Since(s.CreatedAt.Time)
}

// RemoveTerminationsBefore removes all recent terminations before the given timestamp.
// It returns the number of terminations that have been removed.
func (s *MemberStatus) RemoveTerminationsBefore(timestamp time.Time) int {
Expand Down
8 changes: 7 additions & 1 deletion pkg/deployment/deployment_inspector.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,14 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration
d.CreateEvent(k8sutil.NewErrorEvent("AccessPackage creation failed", err, d.apiObject))
}

// Inspect deployment for obsolete members
if err := d.resources.CleanupRemovedMembers(); err != nil {
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("Removed member cleanup failed", err, d.apiObject))
}

// At the end of the inspect, we cleanup terminated pods.
if d.resources.CleanupTerminatedPods(); err != nil {
if err := d.resources.CleanupTerminatedPods(); err != nil {
hasError = true
d.CreateEvent(k8sutil.NewErrorEvent("Pod cleanup failed", err, d.apiObject))
}
Expand Down
6 changes: 6 additions & 0 deletions pkg/deployment/resources/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,12 @@ type Context interface {
// CleanupPod deletes a given pod with force and explicit UID.
// If the pod does not exist, the error is ignored.
CleanupPod(p v1.Pod) error
// DeletePod deletes a pod with given name in the namespace
// of the deployment. If the pod does not exist, the error is ignored.
DeletePod(podName string) error
// DeletePvc deletes a persistent volume claim with given name in the namespace
// of the deployment. If the pvc does not exist, the error is ignored.
DeletePvc(pvcName string) error
// GetAgencyClients returns a client connection for every agency member.
GetAgencyClients(ctx context.Context, predicate func(memberID string) bool) ([]driver.Connection, error)
// GetDatabaseClient returns a cached client for the entire database (cluster coordinators or single server),
Expand Down
126 changes: 126 additions & 0 deletions pkg/deployment/resources/member_cleanup.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
//
// DISCLAIMER
//
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//
// Author Ewout Prangsma
//

package resources

import (
"context"
"time"

driver "github.com/arangodb/go-driver"
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
)

const (
// minMemberAge is the minimum duration we expect a member to be created before we remove it because
// it is not part of a deployment.
minMemberAge = time.Minute * 10
)

// CleanupRemovedMembers removes all arangod members that are no longer part of ArangoDB deployment.
func (r *Resources) CleanupRemovedMembers() error {
// Decide what to do depending on cluster mode
switch r.context.GetSpec().GetMode() {
case api.DeploymentModeCluster:
if err := r.cleanupRemovedClusterMembers(); err != nil {
return maskAny(err)
}
return nil
default:
// Other mode have no concept of cluster in which members can be removed
return nil
}
}

// cleanupRemovedClusterMembers removes all arangod members that are no longer part of the cluster.
func (r *Resources) cleanupRemovedClusterMembers() error {
log := r.log
ctx := context.Background()

// Ask cluster for its health
client, err := r.context.GetDatabaseClient(ctx)
if err != nil {
return maskAny(err)
}
c, err := client.Cluster(ctx)
if err != nil {
return maskAny(err)
}
h, err := c.Health(ctx)
if err != nil {
return maskAny(err)
}

serverFound := func(id string) bool {
_, found := h.Health[driver.ServerID(id)]
return found
}

// For over all members that can be removed
status := r.context.GetStatus()
status.Members.ForeachServerGroup(func(group api.ServerGroup, list *api.MemberStatusList) error {
if group != api.ServerGroupCoordinators && group != api.ServerGroupDBServers {
// We're not interested in these other groups
return nil
}
for _, m := range *list {
if serverFound(m.ID) {
// Member is (still) found, skip it
if m.Conditions.Update(api.ConditionTypeMemberOfCluster, true, "", "") {
list.Update(m)
if err := r.context.UpdateStatus(status); err != nil {
return maskAny(err)
}
}
continue
} else if !m.Conditions.IsTrue(api.ConditionTypeMemberOfCluster) {
// Member is not yet recorded as member of cluster
if m.Age() < minMemberAge {
continue
}
log.Info().Str("member", m.ID).Str("role", group.AsRole()).Msg("Member has never been part of the cluster for a long time. Removing it.")
} else {
// Member no longer part of cluster, remove it
log.Info().Str("member", m.ID).Str("role", group.AsRole()).Msg("Member is no longer part of the ArangoDB cluster. Removing it.")
}
list.RemoveByID(m.ID)
if err := r.context.UpdateStatus(status); err != nil {
return maskAny(err)
}
// Remove Pod & PVC (if any)
if m.PodName != "" {
if err := r.context.DeletePod(m.PodName); err != nil && !k8sutil.IsNotFound(err) {
log.Warn().Err(err).Str("pod", m.PodName).Msg("Failed to remove obsolete pod")
}
}
if m.PersistentVolumeClaimName != "" {
if err := r.context.DeletePvc(m.PersistentVolumeClaimName); err != nil && !k8sutil.IsNotFound(err) {
log.Warn().Err(err).Str("pvc", m.PersistentVolumeClaimName).Msg("Failed to remove obsolete PVC")
}
}
}
return nil
})

return nil
}

0 comments on commit 265397d

Please sign in to comment.