From 80e1ccd0118060dde0c4342262c379ecbc5bf356 Mon Sep 17 00:00:00 2001 From: amecea Date: Thu, 30 May 2019 19:55:04 +0300 Subject: [PATCH] Check for in progress failover and add events on cluster related to orchestrator --- cmd/orchestrator-helper/main.go | 65 +++++++++++++++---- .../{deployment.yaml => statefulset.yaml} | 0 hack/charts/mysql-operator/values.yaml | 25 +++++++ hack/development/Dockerfile.orchestrator | 3 +- pkg/controller/node/node_controller.go | 9 +++ .../orchestrator/orchestrator_reconcile.go | 7 ++ pkg/orc-helper/helper.go | 58 ++++++++++++++++- 7 files changed, 150 insertions(+), 17 deletions(-) rename hack/charts/mysql-operator/templates/{deployment.yaml => statefulset.yaml} (100%) diff --git a/cmd/orchestrator-helper/main.go b/cmd/orchestrator-helper/main.go index 9197bf743..62861caaf 100644 --- a/cmd/orchestrator-helper/main.go +++ b/cmd/orchestrator-helper/main.go @@ -17,16 +17,15 @@ limitations under the License. package main import ( - "log" - "os" - + "github.com/presslabs/mysql-operator/pkg/orc-helper" + "github.com/spf13/cobra" "k8s.io/client-go/kubernetes/scheme" _ "k8s.io/client-go/plugin/pkg/client/auth/gcp" + "log" kclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/config" "github.com/presslabs/mysql-operator/pkg/apis" - "github.com/presslabs/mysql-operator/pkg/orc-helper" ) var ( @@ -34,31 +33,71 @@ var ( ) func main() { - // check command line args - if len(os.Args) != 2 { - log.Fatal("see usage: ") + + cmd := &cobra.Command{ + Use: "orchestrator-helper", + Short: "Helper for orchestrator.", + Long: `This command is a helper for updating MySQL cluster resources. 
Record events.`, + Run: func(cmd *cobra.Command, args []string) { + log.Fatal("you run orchestrator helper, see help section") + }, } // Get a config to talk to the apiserver cfg, err := config.GetConfig() if err != nil { - log.Fatal("unable to get configuration", err) + log.Fatal("unable to get configuration: ", err) } // Setup Scheme for all resources s := scheme.Scheme if err = apis.AddToScheme(s); err != nil { - log.Fatal("unable to register types to scheme", err) + log.Fatal("unable to register types to scheme: ", err) } // initialize k8s client client, err = kclient.New(cfg, kclient.Options{Scheme: s}) if err != nil { - log.Fatal("unable to get the k8s client", err) + log.Fatal("unable to get the k8s client: ", err) } - err = orchelper.UpdateClusterFailoverCond(client, os.Args[1], "orcFailoverInProgress", "Orc failover in progress", true) - if err != nil { - log.Fatal("error in updating cluster: ", err) + fipCmd := &cobra.Command{ + Use: "failover-in-progress", + Short: "Set failover in progress condition for given cluster", + Run: func(cmd *cobra.Command, args []string) { + // check command line args + if len(args) != 2 { + log.Fatal("see usage: ") + } + + err = orchelper.UpdateClusterFailoverCond(client, args[0], "OrcFailoverInProgress", args[1], true) + if err != nil { + log.Fatal("error in updating cluster: ", err) + } + }, + } + cmd.AddCommand(fipCmd) + + evWarningType := false + evCmd := &cobra.Command{ + Use: "event", + Short: "Set event on a given cluster", + Run: func(cmd *cobra.Command, args []string) { + // check command line args + if len(args) != 3 { + log.Fatal("see usage: [-warning]") + } + + err = orchelper.UpdateEventForCluster(client, s, args[0], args[1], args[2], evWarningType) + if err != nil { + log.Fatal("error in updating cluster: ", err) + } + }, + } + evCmd.Flags().BoolVarP(&evWarningType, "warning", "w", false, "if it's a warning event in k8s") + cmd.AddCommand(evCmd) + + if err := cmd.Execute(); err != nil { + log.Fatal("failed to 
execute command: ", err) } } diff --git a/hack/charts/mysql-operator/templates/deployment.yaml b/hack/charts/mysql-operator/templates/statefulset.yaml similarity index 100% rename from hack/charts/mysql-operator/templates/deployment.yaml rename to hack/charts/mysql-operator/templates/statefulset.yaml diff --git a/hack/charts/mysql-operator/values.yaml b/hack/charts/mysql-operator/values.yaml index ad420e7ea..776ebe217 100644 --- a/hack/charts/mysql-operator/values.yaml +++ b/hack/charts/mysql-operator/values.yaml @@ -128,3 +128,28 @@ orchestrator: FailMasterPromotionIfSQLThreadNotUpToDate: true DetachLostReplicasAfterMasterFailover: true + # orchestrator hooks called in the following order + # for more information about template: https://github.com/github/orchestrator/blob/master/go/logic/topology_recovery.go#L256 + ProcessesShellCommand: "sh" + + OnFailureDetectionProcesses: + - "/usr/local/bin/orchestrator-helper event -w '{failureClusterAlias}' 'OrcFailureDetection' 'Failure: {failureType}, failed host: {failedHost}, lost replcas: {lostReplicas}' || true" + - "/usr/local/bin/orchestrator-helper failover-in-progress '{failureClusterAlias}' '{failureDescription}' || true" + + # PreGracefulTakeoverProcesses: + PreFailoverProcesses: + # as backup in case the first request fails + - "/usr/local/bin/orchestrator-helper failover-in-progress '{failureClusterAlias}' '{failureDescription}' || true" + # PostFailoverProcesses: + # - "/usr/local/bin/orchestrator-helper event '{failureClusterAlias}' 'Orc{command}' 'Failure type: {failureType}, failed hosts: {failedHost}, slaves: {countSlaves}' || true" + + PostUnsuccessfulFailoverProcesses: + - "/usr/local/bin/orchestrator-helper event -w '{failureClusterAlias}' 'OrcPostUnsuccessfulFailover' 'Failure: {failureType}, failed host: {failedHost} with {countSlaves} slaves' || true" + + PostMasterFailoverProcesses: + - "/usr/local/bin/orchestrator-helper event '{failureClusterAlias}' 'OrcPostMasterFailover' 'Failure type: 
{failureType}, new master: {successorHost}, slaves: {slaveHosts}' || true" + + PostIntermediateMasterFailoverProcesses: + - "/usr/local/bin/orchestrator-helper event '{failureClusterAlias}' 'OrcPostIntermediateMasterFailover' 'Failure type: {failureType}, failed hosts: {failedHost}, slaves: {countSlaves}' || true" + + # PostGracefulTakeoverProcesses: diff --git a/hack/development/Dockerfile.orchestrator b/hack/development/Dockerfile.orchestrator index 09fcb131a..370472f81 100644 --- a/hack/development/Dockerfile.orchestrator +++ b/hack/development/Dockerfile.orchestrator @@ -1,4 +1,6 @@ +############################################################################### FROM golang:1.9-alpine as builder + RUN set -ex \ && apk add --no-cache \ bash gcc git musl-dev openssl rsync @@ -16,7 +18,6 @@ RUN set -ex \ && ./script/build ############################################################################### - FROM alpine:3.7 # Create a group and user diff --git a/pkg/controller/node/node_controller.go b/pkg/controller/node/node_controller.go index dd874d36a..605729229 100644 --- a/pkg/controller/node/node_controller.go +++ b/pkg/controller/node/node_controller.go @@ -66,6 +66,7 @@ func Add(mgr manager.Manager) error { // newReconciler returns a new reconcile.Reconciler func newReconciler(mgr manager.Manager, sqlI sqlFactoryFunc) reconcile.Reconciler { return &ReconcileMysqlNode{ + // TODO(amecea): use client without cache here Client: mgr.GetClient(), scheme: mgr.GetScheme(), recorder: mgr.GetRecorder(controllerName), @@ -141,6 +142,7 @@ type ReconcileMysqlNode struct { // and what is in the MysqlCluster.Spec // Automatically generate RBAC rules to allow the Controller to read and write Deployments // +kubebuilder:rbac:groups=core,resources=pods/status,verbs=get;list;watch;create;update;patch;delete +// nolint: gocyclo func (r *ReconcileMysqlNode) Reconcile(request reconcile.Request) (reconcile.Result, error) { ctx, cancel := context.WithTimeout(context.TODO(), 
mysqlReconciliationTimeout) defer cancel() @@ -173,6 +175,13 @@ func (r *ReconcileMysqlNode) Reconcile(request reconcile.Request) (reconcile.Res return reconcile.Result{}, nil } + // check if there is an in progress failover. K8s cluster resource may be inconsistent with what exists in k8s + fip := cluster.GetClusterCondition(api.ClusterConditionFailoverInProgress) + if fip != nil && fip.Status == corev1.ConditionTrue { + log.Info("cluster has failover in progress, given up to sync new node", "pod", pod.Spec.Hostname, "time", fip.LastTransitionTime) + return reconcile.Result{}, nil + } + // if it's a old version cluster then don't do anything if shouldUpdateToVersion(cluster, 300) { // if the cluster is upgraded then set on the cluster an annotations that skips the GTID configuration diff --git a/pkg/controller/orchestrator/orchestrator_reconcile.go b/pkg/controller/orchestrator/orchestrator_reconcile.go index 593cc8aa4..d7e1cd14f 100644 --- a/pkg/controller/orchestrator/orchestrator_reconcile.go +++ b/pkg/controller/orchestrator/orchestrator_reconcile.go @@ -254,6 +254,13 @@ func (ou *orcUpdater) updateStatusFromOrc(insts InstancesSet, master *orc.Instan ou.cluster.UpdateStatusCondition(api.ClusterConditionReadOnly, core.ConditionFalse, "ClusterReadOnlyFalse", "cluster is writable") } + + // check if the master is up to date and is not downtime to remove in progress failover condition + if master != nil && !master.IsDowntimed && master.IsUpToDate { + log.Info("cluster failover finished", "master", master.Key.Hostname) + ou.cluster.UpdateStatusCondition(api.ClusterConditionFailoverInProgress, core.ConditionFalse, + "ClusterMasterHealthy", "Master is healthy in orchestrator") + } } // updateNodesInOrc is the functions that tries to register diff --git a/pkg/orc-helper/helper.go b/pkg/orc-helper/helper.go index 508147c2a..3e4d50bec 100644 --- a/pkg/orc-helper/helper.go +++ b/pkg/orc-helper/helper.go @@ -19,13 +19,18 @@ package orchelper import ( "context" "fmt" + 
"github.com/presslabs/controller-util/rand" + core "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/reference" + "sigs.k8s.io/controller-runtime/pkg/client" "strings" + "time" api "github.com/presslabs/mysql-operator/pkg/apis/mysql/v1alpha1" "github.com/presslabs/mysql-operator/pkg/internal/mysqlcluster" - core "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client" ) // parse the orchestrator cluster name as NamespacedName @@ -67,3 +72,50 @@ func UpdateClusterFailoverCond(c client.Client, clusterName, reason, msg string, return nil } + +// UpdateEventForCluster records an event on MySQL cluster resource +func UpdateEventForCluster(c client.Client, s *runtime.Scheme, clusterName, evReason, evMsg string, warning bool) error { + key, err := orcNameToKey(clusterName) + if err != nil { + return err + } + + cluster := mysqlcluster.New(&api.MysqlCluster{}) + + // get cluster from k8s + if err = c.Get(context.TODO(), key, cluster.Unwrap()); err != nil { + return err + } + + evType := core.EventTypeNormal + if warning { + evType = core.EventTypeWarning + } + + ref, err := reference.GetReference(s, cluster.Unwrap()) + if err != nil { + return err + } + + randStr, err := rand.AlphaNumericString(5) + if err != nil { + return err + } + event := &core.Event{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("%s-%s.%d", cluster.Name, randStr, time.Now().Unix()), + Namespace: cluster.Namespace, + }, + FirstTimestamp: metav1.Now(), + Type: evType, + Reason: evReason, + Message: evMsg, + Source: core.EventSource{Component: "orchestrator"}, + InvolvedObject: *ref, + } + if err := c.Create(context.TODO(), event); err != nil { + return err + } + + return nil +}