Skip to content

Commit

Permalink
Make rebuild shard optional feature
Browse files Browse the repository at this point in the history
  • Loading branch information
jwierzbo committed Apr 24, 2023
1 parent 59e237b commit b2ca265
Show file tree
Hide file tree
Showing 7 changed files with 102 additions and 14 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ Feature-wise production readiness table:
| Operator Internal Metrics Exporter | 1.2.0 | >= 3.6.0 | Community, Enterprise | 1.2.0 | Production | True | --deployment.feature.metrics-exporter | N/A |
| Operator Ephemeral Volumes | 1.2.2 | >= 3.7.0 | Community, Enterprise | 1.2.2 | Alpha | False | --deployment.feature.ephemeral-volumes | N/A |
| Spec Default Restore | 1.2.21 | >= 3.7.0 | Community, Enterprise | 1.2.21 | Beta | True | --deployment.feature.deployment-spec-defaults-restore | If set to False Operator will not change ArangoDeployment Spec |
| Force Rebuild Out Synced Shards | 1.2.27 | >= 3.8.0 | Community, Enterprise | 1.2.27 | Beta | False | --deployment.feature.force-rebuild-out-synced-shards | It should be used only if the user is aware of the risks. |

## Operator Community Edition (CE)

Expand Down
3 changes: 3 additions & 0 deletions cmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ var (
arangoDCheck time.Duration
reconciliation time.Duration
agency time.Duration
shardRebuild time.Duration
}
chaosOptions struct {
allowed bool
Expand Down Expand Up @@ -211,6 +212,7 @@ func init() {
f.DurationVar(&operatorTimeouts.arangoDCheck, "timeout.arangod-check", globals.DefaultArangoDCheckTimeout, "The version check request timeout to the ArangoDB")
f.DurationVar(&operatorTimeouts.agency, "timeout.agency", globals.DefaultArangoDAgencyTimeout, "The Agency read timeout")
f.DurationVar(&operatorTimeouts.reconciliation, "timeout.reconciliation", globals.DefaultReconciliationTimeout, "The reconciliation timeout to the ArangoDB CR")
f.DurationVar(&operatorTimeouts.shardRebuild, "timeout.shard-rebuild", globals.DefaultOutSyncedShardRebuildTimeout, "Timeout after which particular out-synced shard is rebuilt")
f.DurationVar(&shutdownOptions.delay, "shutdown.delay", defaultShutdownDelay, "The delay before running shutdown handlers")
f.DurationVar(&shutdownOptions.timeout, "shutdown.timeout", defaultShutdownTimeout, "Timeout for shutdown handlers")
f.BoolVar(&operatorOptions.scalingIntegrationEnabled, "internal.scaling-integration", false, "Enable Scaling Integration")
Expand Down Expand Up @@ -257,6 +259,7 @@ func executeMain(cmd *cobra.Command, args []string) {
globals.GetGlobalTimeouts().Agency().Set(operatorTimeouts.agency)
globals.GetGlobalTimeouts().ArangoDCheck().Set(operatorTimeouts.arangoDCheck)
globals.GetGlobalTimeouts().Reconciliation().Set(operatorTimeouts.reconciliation)
globals.GetGlobalTimeouts().ShardRebuild().Set(operatorTimeouts.shardRebuild)
globals.GetGlobals().Kubernetes().RequestBatchSize().Set(operatorKubernetesOptions.maxBatchSize)
globals.GetGlobals().Backup().ConcurrentUploads().Set(operatorBackup.concurrentUploads)

Expand Down
19 changes: 19 additions & 0 deletions docs/design/rebuild_out_synced_shards.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Force rebuild out-synced Shards with broken Merkle Tree

## Overview

TODO

## How to use

This feature is disabled by default. To enable it, the `--deployment.feature.force-rebuild-out-synced-shards` argument needs to be passed to the operator.
The default timeout (60 min) for this feature can be changed with the `--timeout.shard-rebuild <duration>` argument.

Here is an example `helm` command that enables this feature and sets the timeout to 10 minutes:
```shell
export VER=1.2.26; helm upgrade --install kube-arangodb \
https://github.com/arangodb/kube-arangodb/releases/download/$VER/kube-arangodb-$VER.tgz \
--set operator.imagePullPolicy=Always \
--set "operator.args={--deployment.feature.force-rebuild-out-synced-shards,--timeout.shard-rebuild=10m}" \
--set operator.image=wierzbiks/kube-arangodb:$IMAGETAG
```
38 changes: 38 additions & 0 deletions pkg/deployment/features/rebuild_out_synced_shards.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
//
// DISCLAIMER
//
// Copyright 2023 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//

package features

// init registers the rebuildOutSyncedShards feature through registerFeature
// when the package is initialized.
func init() {
registerFeature(rebuildOutSyncedShards)
}

// rebuildOutSyncedShards describes the "force-rebuild-out-synced-shards"
// feature. It is registered in init and exposed to other packages through
// RebuildOutSyncedShards. The feature is off unless enabled explicitly
// (enabledByDefault is false).
//
// NOTE(review): enterpriseRequired is true here, while the README feature table
// lists this feature for both Community and Enterprise — confirm which is intended.
// NOTE(review): hidden is true although the feature is documented; the exact
// effect of hidden is defined by the feature type — verify it is intended.
var rebuildOutSyncedShards = &feature{
name: "force-rebuild-out-synced-shards",
description: "Force rebuild permanently out-synced shards due to a bug in ArangoDB 3.8-3.10",
version: "3.8.0",
enterpriseRequired: true,
enabledByDefault: false,
hidden: true,
}

// RebuildOutSyncedShards returns the "force-rebuild-out-synced-shards" feature
// definition as a Feature, so callers can query it (for example via Enabled()).
func RebuildOutSyncedShards() Feature {
return rebuildOutSyncedShards
}
17 changes: 11 additions & 6 deletions pkg/deployment/reconcile/action_rebuild_outsynced_shards.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,10 @@ import (
"github.com/arangodb/kube-arangodb/pkg/deployment/agency"
"github.com/arangodb/kube-arangodb/pkg/util/arangod/conn"
"github.com/arangodb/kube-arangodb/pkg/util/errors"
"github.com/arangodb/kube-arangodb/pkg/util/globals"
)

const (
// TODO make it configurable
ttlRebuildOutSyncedShards time.Duration = 600
actionRebuildOutSyncedShardsLocalJobID api.PlanLocalKey = "rebuildJobID"
actionRebuildOutSyncedShardsLocalDatabase api.PlanLocalKey = "database"
actionRebuildOutSyncedShardsLocalShard api.PlanLocalKey = "shard"
Expand Down Expand Up @@ -257,7 +256,7 @@ func (a *actionRebuildOutSyncedShards) createBatch(ctx context.Context, clientSy
}
params := struct {
TTL float64 `json:"ttl"`
}{TTL: ttlRebuildOutSyncedShards.Seconds()}
}{TTL: globals.GetGlobalTimeouts().ShardRebuild().Get().Seconds()}
req, err = req.SetBody(params)
if err != nil {
return "", errors.Wrapf(err, "Unable to add body to the batch creation request")
Expand All @@ -282,11 +281,17 @@ func (a *actionRebuildOutSyncedShards) createBatch(ctx context.Context, clientSy

// deleteBatch removes the replication batch identified by batchID from the server.
//
// It sends DELETE _api/replication/batch/<batchID> over the client's connection.
// An error is returned when the request cannot be created, when sending it
// fails, or when the server answers with a status code other than 204.
func (a *actionRebuildOutSyncedShards) deleteBatch(ctx context.Context, clientSync driver.Client, batchID string) error {
	// Batch removal is a DELETE on the batch resource; a POST to the same path
	// does not remove the batch.
	req, err := clientSync.Connection().NewRequest("DELETE", path.Join("_api/replication/batch", batchID))
	if err != nil {
		return errors.Wrapf(err, "Unable to create request for batch removal")
	}

	resp, err := clientSync.Connection().Do(ctx, req)
	if err != nil {
		return errors.Wrapf(err, "Unable to remove batch, request failed")
	}
	// A transport-level success is not enough: the server must confirm the
	// removal with 204, otherwise the batch may still exist.
	if err := resp.CheckStatus(204); err != nil {
		return errors.Wrapf(err, "Unable to remove batch, wrong status code %d", resp.StatusCode())
	}
	return nil
}
26 changes: 20 additions & 6 deletions pkg/deployment/reconcile/plan_builder_rebuild_outsynced_shards.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,35 +22,49 @@ package reconcile

import (
"context"
"strings"

api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/deployment/actions"
"github.com/arangodb/kube-arangodb/pkg/deployment/agency"
"github.com/arangodb/kube-arangodb/pkg/deployment/features"
"github.com/arangodb/kube-arangodb/pkg/util/globals"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
)

// TODO use it only if is enabled in operator explicitly
// TODO consider to use it as internal method in Scale/Upgrade/Restart plans
// createRotateOrUpgradePlan
func (r *Reconciler) createRebuildOutSyncedPlan(ctx context.Context, apiObject k8sutil.APIObject,
spec api.DeploymentSpec, status api.DeploymentStatus,
context PlanBuilderContext) api.Plan {
var plan api.Plan

if !features.RebuildOutSyncedShards().Enabled() {
// RebuildOutSyncedShards feature is not enabled
return nil
}

// TODO: use that to check if we need to rebuild shards
r.log.Info("Rebuilding out-synced shards timeout", globals.GetGlobalTimeouts().ShardRebuild().Get())

agencyState, ok := context.GetAgencyCache()
if !ok {
// Unable to get agency state, do not restart
r.planLogger.Error("Unable to get agency state")
plan = append(plan, actions.NewClusterAction(api.ActionTypeIdle))
return plan
r.log.Error("Unable to get agency state")
return nil
}

for _, m := range status.Members.AsList() {
if m.Group == api.ServerGroupDBServers {
// use * for global
notInSyncShards := agency.GetDBServerShardsNotInSync(agencyState, agency.Server(m.Member.ID))

if s := len(notInSyncShards); s > 0 {
m.Member.Conditions.Update(api.ConditionTypeOutSyncedShards, true, "Member has out-synced shard(s)", "")
var shardsID []string
for _, shard := range notInSyncShards {
shardsID = append(shardsID, shard.Shard)
}

m.Member.Conditions.Update(api.ConditionTypeOutSyncedShards, true, "Member has out-synced shard(s)", strings.Join(shardsID, ", "))
plan = append(plan, actions.NewAction(api.ActionTypeRebuildOutSyncedShards, api.ServerGroupDBServers, m.Member))
}
}
Expand Down
12 changes: 10 additions & 2 deletions pkg/util/globals/global.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//
// DISCLAIMER
//
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
// Copyright 2016-2023 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -29,6 +29,8 @@ const (
DefaultArangoDCheckTimeout = time.Second * 2
DefaultReconciliationTimeout = time.Minute

DefaultOutSyncedShardRebuildTimeout = time.Minute * 60

DefaultKubernetesRequestBatchSize = 256

DefaultBackupConcurrentUploads = 4
Expand All @@ -41,6 +43,7 @@ var globalObj = &globals{
arangodCheck: NewTimeout(DefaultArangoDCheckTimeout),
reconciliation: NewTimeout(DefaultReconciliationTimeout),
agency: NewTimeout(DefaultArangoDAgencyTimeout),
shardRebuild: NewTimeout(DefaultOutSyncedShardRebuildTimeout),
},
kubernetes: &globalKubernetes{
requestBatchSize: NewInt64(DefaultKubernetesRequestBatchSize),
Expand Down Expand Up @@ -108,6 +111,7 @@ func (g *globalBackup) ConcurrentUploads() Int {

type GlobalTimeouts interface {
Reconciliation() Timeout
ShardRebuild() Timeout

Kubernetes() Timeout
ArangoD() Timeout
Expand All @@ -116,7 +120,7 @@ type GlobalTimeouts interface {
}

// globalTimeouts stores one Timeout per timeout category; each field is
// returned by the accessor method of the matching name (for example
// shardRebuild by ShardRebuild).
type globalTimeouts struct {
	requests, arangod, reconciliation, arangodCheck, agency, shardRebuild Timeout
}

func (g *globalTimeouts) Agency() Timeout {
Expand All @@ -131,6 +135,10 @@ func (g *globalTimeouts) Reconciliation() Timeout {
return g.reconciliation
}

// ShardRebuild returns the Timeout held in the shardRebuild field. Its default
// is DefaultOutSyncedShardRebuildTimeout and the operator overrides it from the
// --timeout.shard-rebuild flag.
func (g *globalTimeouts) ShardRebuild() Timeout {
return g.shardRebuild
}

// ArangoD returns the Timeout held in the arangod field.
func (g *globalTimeouts) ArangoD() Timeout {
return g.arangod
}
Expand Down

0 comments on commit b2ca265

Please sign in to comment.