From b2ca265767263b7e63cda9e36ecc4c2b65c538d3 Mon Sep 17 00:00:00 2001 From: jwierzbo Date: Mon, 24 Apr 2023 16:07:48 +0200 Subject: [PATCH] Make rebuild shard optional feature --- README.md | 1 + cmd/cmd.go | 3 ++ docs/design/rebuild_out_synced_shards.md | 19 ++++++++++ .../features/rebuild_out_synced_shards.go | 38 +++++++++++++++++++ .../action_rebuild_outsynced_shards.go | 17 ++++++--- .../plan_builder_rebuild_outsynced_shards.go | 26 ++++++++++--- pkg/util/globals/global.go | 12 +++++- 7 files changed, 102 insertions(+), 14 deletions(-) create mode 100644 docs/design/rebuild_out_synced_shards.md create mode 100644 pkg/deployment/features/rebuild_out_synced_shards.go diff --git a/README.md b/README.md index 90a86ef3f..00c54f6a3 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ Feature-wise production readiness table: | Operator Internal Metrics Exporter | 1.2.0 | >= 3.6.0 | Community, Enterprise | 1.2.0 | Production | True | --deployment.feature.metrics-exporter | N/A | | Operator Ephemeral Volumes | 1.2.2 | >= 3.7.0 | Community, Enterprise | 1.2.2 | Alpha | False | --deployment.feature.ephemeral-volumes | N/A | | Spec Default Restore | 1.2.21 | >= 3.7.0 | Community, Enterprise | 1.2.21 | Beta | True | --deployment.feature.deployment-spec-defaults-restore | If set to False Operator will not change ArangoDeployment Spec | +| Force Rebuild Out Synced Shards | 1.2.27 | >= 3.8.0 | Community, Enterprise | 1.2.27 | Beta | False | --deployment.feature.force-rebuild-out-synced-shards | It should be used only if user is aware of the risks. 
| ## Operator Community Edition (CE) diff --git a/cmd/cmd.go b/cmd/cmd.go index b87f1d0d0..5c9635dd6 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -159,6 +159,7 @@ var ( arangoDCheck time.Duration reconciliation time.Duration agency time.Duration + shardRebuild time.Duration } chaosOptions struct { allowed bool @@ -211,6 +212,7 @@ func init() { f.DurationVar(&operatorTimeouts.arangoDCheck, "timeout.arangod-check", globals.DefaultArangoDCheckTimeout, "The version check request timeout to the ArangoDB") f.DurationVar(&operatorTimeouts.agency, "timeout.agency", globals.DefaultArangoDAgencyTimeout, "The Agency read timeout") f.DurationVar(&operatorTimeouts.reconciliation, "timeout.reconciliation", globals.DefaultReconciliationTimeout, "The reconciliation timeout to the ArangoDB CR") + f.DurationVar(&operatorTimeouts.shardRebuild, "timeout.shard-rebuild", globals.DefaultOutSyncedShardRebuildTimeout, "Timeout after which particular out-synced shard is rebuilt") f.DurationVar(&shutdownOptions.delay, "shutdown.delay", defaultShutdownDelay, "The delay before running shutdown handlers") f.DurationVar(&shutdownOptions.timeout, "shutdown.timeout", defaultShutdownTimeout, "Timeout for shutdown handlers") f.BoolVar(&operatorOptions.scalingIntegrationEnabled, "internal.scaling-integration", false, "Enable Scaling Integration") @@ -257,6 +259,7 @@ func executeMain(cmd *cobra.Command, args []string) { globals.GetGlobalTimeouts().Agency().Set(operatorTimeouts.agency) globals.GetGlobalTimeouts().ArangoDCheck().Set(operatorTimeouts.arangoDCheck) globals.GetGlobalTimeouts().Reconciliation().Set(operatorTimeouts.reconciliation) + globals.GetGlobalTimeouts().ShardRebuild().Set(operatorTimeouts.shardRebuild) globals.GetGlobals().Kubernetes().RequestBatchSize().Set(operatorKubernetesOptions.maxBatchSize) globals.GetGlobals().Backup().ConcurrentUploads().Set(operatorBackup.concurrentUploads) diff --git a/docs/design/rebuild_out_synced_shards.md b/docs/design/rebuild_out_synced_shards.md new 
file mode 100644 index 000000000..06f6a740a --- /dev/null +++ b/docs/design/rebuild_out_synced_shards.md @@ -0,0 +1,19 @@ +# Force rebuild out-synced Shards with broken Merkle Tree + +## Overview + +TODO + +## How to use + +This feature is disabled by default. To enable it, pass the `--deployment.feature.force-rebuild-out-synced-shards` argument to the operator. +The default timeout (60 min) for this feature can be changed with the `--timeout.shard-rebuild <duration>` argument. + +Here is an example `helm` command which enables this feature and sets the timeout to 10 minutes: +```shell +export VER=1.2.26; helm upgrade --install kube-arangodb \ +https://github.com/arangodb/kube-arangodb/releases/download/$VER/kube-arangodb-$VER.tgz \ + --set operator.imagePullPolicy=Always \ + --set "operator.args={--deployment.feature.force-rebuild-out-synced-shards,--timeout.shard-rebuild=10m}" \ + --set operator.image=wierzbiks/kube-arangodb:$IMAGETAG +``` diff --git a/pkg/deployment/features/rebuild_out_synced_shards.go b/pkg/deployment/features/rebuild_out_synced_shards.go new file mode 100644 index 000000000..02c4520cb --- /dev/null +++ b/pkg/deployment/features/rebuild_out_synced_shards.go @@ -0,0 +1,38 @@ +// +// DISCLAIMER +// +// Copyright 2023 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// + +package features + +func init() { + registerFeature(rebuildOutSyncedShards) +} + +var rebuildOutSyncedShards = &feature{ + name: "force-rebuild-out-synced-shards", + description: "Force rebuild permanently out-synced shards due to a bug in ArangoDB 3.8-3.10", + version: "3.8.0", + enterpriseRequired: true, + enabledByDefault: false, + hidden: true, +} + +func RebuildOutSyncedShards() Feature { + return rebuildOutSyncedShards +} diff --git a/pkg/deployment/reconcile/action_rebuild_outsynced_shards.go b/pkg/deployment/reconcile/action_rebuild_outsynced_shards.go index 6f5860544..66457fd6b 100644 --- a/pkg/deployment/reconcile/action_rebuild_outsynced_shards.go +++ b/pkg/deployment/reconcile/action_rebuild_outsynced_shards.go @@ -34,11 +34,10 @@ import ( "github.com/arangodb/kube-arangodb/pkg/deployment/agency" "github.com/arangodb/kube-arangodb/pkg/util/arangod/conn" "github.com/arangodb/kube-arangodb/pkg/util/errors" + "github.com/arangodb/kube-arangodb/pkg/util/globals" ) const ( - // TODO make it configurable - ttlRebuildOutSyncedShards time.Duration = 600 actionRebuildOutSyncedShardsLocalJobID api.PlanLocalKey = "rebuildJobID" actionRebuildOutSyncedShardsLocalDatabase api.PlanLocalKey = "database" actionRebuildOutSyncedShardsLocalShard api.PlanLocalKey = "shard" @@ -257,7 +256,7 @@ func (a *actionRebuildOutSyncedShards) createBatch(ctx context.Context, clientSy } params := struct { TTL float64 `json:"ttl"` - }{TTL: ttlRebuildOutSyncedShards.Seconds()} + }{TTL: globals.GetGlobalTimeouts().ShardRebuild().Get().Seconds()} req, err = req.SetBody(params) if err != nil { return "", errors.Wrapf(err, "Unable to add body to the batch creation request") @@ -282,11 +281,17 @@ func (a *actionRebuildOutSyncedShards) createBatch(ctx context.Context, clientSy // deleteBatch removes batch from the server func (a *actionRebuildOutSyncedShards) deleteBatch(ctx context.Context, clientSync driver.Client, batchID 
string) error { - req, err := clientSync.Connection().NewRequest("POST", path.Join("_api/replication/batch", batchID)) + req, err := clientSync.Connection().NewRequest("DELETE", path.Join("_api/replication/batch", batchID)) if err != nil { return errors.Wrapf(err, "Unable to create request for batch removal") } - _, err = clientSync.Connection().Do(ctx, req) - return err + resp, err := clientSync.Connection().Do(ctx, req) + if err != nil { + return errors.Wrapf(err, "Unable to remove batch, request failed") + } + if err := resp.CheckStatus(204); err != nil { + return errors.Wrapf(err, "Unable to remove batch, wrong status code %d", resp.StatusCode()) + } + return nil } diff --git a/pkg/deployment/reconcile/plan_builder_rebuild_outsynced_shards.go b/pkg/deployment/reconcile/plan_builder_rebuild_outsynced_shards.go index 3df098aba..b1b2e9670 100644 --- a/pkg/deployment/reconcile/plan_builder_rebuild_outsynced_shards.go +++ b/pkg/deployment/reconcile/plan_builder_rebuild_outsynced_shards.go @@ -22,35 +22,49 @@ package reconcile import ( "context" + "strings" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1" "github.com/arangodb/kube-arangodb/pkg/deployment/actions" "github.com/arangodb/kube-arangodb/pkg/deployment/agency" + "github.com/arangodb/kube-arangodb/pkg/deployment/features" + "github.com/arangodb/kube-arangodb/pkg/util/globals" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" ) -// TODO use it only if is enabled in operator explicitly -// TODO consider to use it as internal method in Scale/Upgrade/Restart plans // createRotateOrUpgradePlan func (r *Reconciler) createRebuildOutSyncedPlan(ctx context.Context, apiObject k8sutil.APIObject, spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan { var plan api.Plan + if !features.RebuildOutSyncedShards().Enabled() { + // RebuildOutSyncedShards feature is not enabled + return nil + } + + // TODO: use that to check if we need to rebuild shards + 
r.log.Info("Rebuilding out-synced shards timeout", globals.GetGlobalTimeouts().ShardRebuild().Get()) + agencyState, ok := context.GetAgencyCache() if !ok { // Unable to get agency state, do not restart - r.planLogger.Error("Unable to get agency state") - plan = append(plan, actions.NewClusterAction(api.ActionTypeIdle)) - return plan + r.log.Error("Unable to get agency state") + return nil } for _, m := range status.Members.AsList() { if m.Group == api.ServerGroupDBServers { + // use * for global notInSyncShards := agency.GetDBServerShardsNotInSync(agencyState, agency.Server(m.Member.ID)) if s := len(notInSyncShards); s > 0 { - m.Member.Conditions.Update(api.ConditionTypeOutSyncedShards, true, "Member has out-synced shard(s)", "") + var shardsID []string + for _, shard := range notInSyncShards { + shardsID = append(shardsID, shard.Shard) + } + + m.Member.Conditions.Update(api.ConditionTypeOutSyncedShards, true, "Member has out-synced shard(s)", strings.Join(shardsID, ", ")) plan = append(plan, actions.NewAction(api.ActionTypeRebuildOutSyncedShards, api.ServerGroupDBServers, m.Member)) } } diff --git a/pkg/util/globals/global.go b/pkg/util/globals/global.go index 7dc1ae923..6c204f518 100644 --- a/pkg/util/globals/global.go +++ b/pkg/util/globals/global.go @@ -1,7 +1,7 @@ // // DISCLAIMER // -// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany +// Copyright 2016-2023 ArangoDB GmbH, Cologne, Germany // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -29,6 +29,8 @@ const ( DefaultArangoDCheckTimeout = time.Second * 2 DefaultReconciliationTimeout = time.Minute + DefaultOutSyncedShardRebuildTimeout = time.Minute * 60 + DefaultKubernetesRequestBatchSize = 256 DefaultBackupConcurrentUploads = 4 @@ -41,6 +43,7 @@ var globalObj = &globals{ arangodCheck: NewTimeout(DefaultArangoDCheckTimeout), reconciliation: NewTimeout(DefaultReconciliationTimeout), agency: NewTimeout(DefaultArangoDAgencyTimeout), + shardRebuild: NewTimeout(DefaultOutSyncedShardRebuildTimeout), }, kubernetes: &globalKubernetes{ requestBatchSize: NewInt64(DefaultKubernetesRequestBatchSize), @@ -108,6 +111,7 @@ func (g *globalBackup) ConcurrentUploads() Int { type GlobalTimeouts interface { Reconciliation() Timeout + ShardRebuild() Timeout Kubernetes() Timeout ArangoD() Timeout @@ -116,7 +120,7 @@ type GlobalTimeouts interface { } type globalTimeouts struct { - requests, arangod, reconciliation, arangodCheck, agency Timeout + requests, arangod, reconciliation, arangodCheck, agency, shardRebuild Timeout } func (g *globalTimeouts) Agency() Timeout { @@ -131,6 +135,10 @@ func (g *globalTimeouts) Reconciliation() Timeout { return g.reconciliation } +func (g *globalTimeouts) ShardRebuild() Timeout { + return g.shardRebuild +} + func (g *globalTimeouts) ArangoD() Timeout { return g.arangod }