From 7e5cdc0c1c6305c54930dedb639cc042e2a6bcce Mon Sep 17 00:00:00 2001
From: Francesco Torta <62566275+fra98@users.noreply.github.com>
Date: Fri, 18 Oct 2024 17:42:36 +0200
Subject: [PATCH] docs: add section for Liqo pods in HA

---
 cmd/liqoctl/cmd/install.go       |  2 --
 cmd/webhook/main.go              |  2 +-
 docs/installation/install.md     | 14 +++++++++++++-
 docs/usage/service-continuity.md | 42 +++++++++++++++++++++++++++++-------------
 pkg/liqoctl/install/handler.go   | 10 ----------
 5 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/cmd/liqoctl/cmd/install.go b/cmd/liqoctl/cmd/install.go
index a9bded4c65..1f941b8cc2 100644
--- a/cmd/liqoctl/cmd/install.go
+++ b/cmd/liqoctl/cmd/install.go
@@ -156,8 +156,6 @@ func newInstallCommand(ctx context.Context, f *factory.Factory) *cobra.Command {
 	cmd.PersistentFlags().Var(&clusterLabels, "cluster-labels",
 		"The set of labels (i.e., key/value pairs, separated by comma) identifying the current cluster, and propagated to the virtual nodes")
 
-	cmd.PersistentFlags().BoolVar(&options.EnableHA, "enable-ha", false,
-		"Enable the support for high-availability of Liqo components, currently supported by the gateway and the controller manager.")
 	cmd.PersistentFlags().Var(&reservedSubnets, "reserved-subnets",
 		"The private CIDRs to be excluded, as already in use (e.g., the subnet of the cluster nodes); PodCIDR and ServiceCIDR shall not be included.")
 
diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go
index 610d3065bf..c6b3ce6ca3 100644
--- a/cmd/webhook/main.go
+++ b/cmd/webhook/main.go
@@ -75,7 +75,7 @@ func main() {
 	webhookPort := pflag.Uint("webhook-port", 9443, "The port the webhook server binds to")
 	metricsAddr := pflag.String("metrics-address", ":8080", "The address the metric endpoint binds to")
 	probeAddr := pflag.String("health-probe-address", ":8081", "The address the health probe endpoint binds to")
-	leaderElection := pflag.Bool("enable-leader-election", false, "Enable leader election for controller manager")
+	leaderElection := pflag.Bool("enable-leader-election", false, "Enable leader election for the webhook pod")
 
 	// Global parameters
 	clusterIDFlags := argsutils.NewClusterIDFlags(true, nil)
diff --git a/docs/installation/install.md b/docs/installation/install.md
index f72598ad7f..4d14297c24 100644
--- a/docs/installation/install.md
+++ b/docs/installation/install.md
@@ -388,7 +388,6 @@ Before listing all the parameters, we start here with some general consideration
 The main global flags, besides those concerning the installation of [development versions](InstallationDevelopmentVersions), include:
 
-* `--enable-ha`: enables the support for **high-availability of the Liqo components**, starting two replicas (in an active/standby configuration) of the **gateway** to ensure no cross-cluster connectivity downtime in case one of the replicas is restarted, as well as of the **controller manager**, which embeds the Liqo control plane logic.
 * `--enable-metrics`: exposes Liqo **metrics** through **Prometheus** (see the dedicated [Prometheus metrics page](/usage/prometheus-metrics.md) for additional details).
 * `--timeout`: configures the timeout for the completion of the installation/upgrade process.
   Once expired, the process is aborted and Liqo is rolled back to the previous version.
 
@@ -417,6 +416,19 @@ The main networking flags include:
 * `--reserved-subnets`: the list of **private CIDRs to be excluded** from the ones used by Liqo to remap remote clusters in case of address conflicts, as already in use (e.g., the subnet of the cluster nodes).
   The Pod CIDR and the Service CIDR shall not be manually specified, as automatically included in the reserved list.
 
+### High-availability components
+
+Liqo supports **high availability of its components**, starting multiple replicas of the same pod in an active/passive fashion.
+This ensures that, even in case of pod restarts or node failures, exactly one replica is always active while the remaining ones run on standby.
+Refer to the [service continuity page](ServiceContinuityHA) to see which components can be configured in HA and how to set the desired number of replicas.
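+
+For instance, a command like the following requests two replicas of the Liqo controller manager at install time (a minimal example; `kubeadm` is just a sample provider):
+
+```bash
+# Override the Helm value controlling the number of controller-manager replicas.
+liqoctl install kubeadm --set controllerManager.replicas=2
+```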
+
 (InstallationHelm)=
 
 ## Install with Helm
diff --git a/docs/usage/service-continuity.md b/docs/usage/service-continuity.md
index f1a55d63fc..7da9496293 100644
--- a/docs/usage/service-continuity.md
+++ b/docs/usage/service-continuity.md
@@ -6,6 +6,35 @@ It reports the main architectural design choices and the options to better handl
 For simplicity, we consider a simple consumer-provider setup, where the consumer/local cluster offloads an application to a provider/remote cluster.
 Since a single peering is unidirectional and between two clusters, all the following considerations can be extended to more complex setups involving bidirectional peerings and/or multiple clusters.
 
+(ServiceContinuityHA)=
+
+## High-availability Liqo components
+
+Liqo allows you to deploy its most critical components in high availability.
+This is achieved by deploying multiple replicas of the same component in an **active/passive** fashion.
+This ensures that, even in case of pod restarts or node failures, exactly one replica is always active while the remaining ones run on standby.
+
+The supported components (pods) in high availability are the following (an example configuration is shown after the list):
+
+- ***liqo-controller-manager***: ensures that the Liqo control plane logic is always enforced. The number of replicas is configurable through the Helm value `controllerManager.replicas`.
+- ***WireGuard gateway server and client***: ensures no cross-cluster connectivity downtime. The number of replicas is configurable through the Helm value `networking.gatewayTemplates.replicas`.
+- ***webhook***: ensures that the enforcement of Liqo resources stays responsive, as at least one webhook pod is always active and reachable through its Service. The number of replicas is configurable through the Helm value `webhook.replicas`.
+- ***virtual-kubelet***: improves the responsiveness of VirtualNodes when the leading virtual-kubelet fails or is restarted. The number of replicas is configurable through the Helm value `virtualKubelet.replicas`.
+- ***ipam***: ensures that the management of IPs and Networks is always up and responsive. The number of replicas is configurable through the Helm value `ipam.internal.replicas`.
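+
+For instance, a values file like the following (a minimal sketch, to be adapted to your setup) requests two replicas of the controller manager, the webhook, and the WireGuard gateways, and can be applied at install or upgrade time (e.g., via Helm's `--values` flag):
+
+```yaml
+# Example values: two replicas for some of the HA-capable components listed above.
+controllerManager:
+  replicas: 2
+webhook:
+  replicas: 2
+networking:
+  gatewayTemplates:
+    replicas: 2
+```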
+
 ## Resilience to cluster failures/unavailability
 
 Liqo performs periodic checks to ensure the availability and readiness of all peered clusters.
@@ -102,16 +131,3 @@ Enabling the controller can have some minor drawbacks: when the pod is force-del
 This means that in the (rare) case that the failed node becomes ready again and without an OS restart, the containers in the pod will not be gracefully deleted by the API server because the entry is not in the database anymore.
 The side effect is that zombie processes associated with the pod will remain in the node until the next OS restart or manual cleanup.
 ```
-
-## High-availability Liqo components
-
-Liqo allows to deploy the most critical Liqo components in high availability.
-This is achieved by deploying multiple replicas of the same component in an **active/standby** fashion.
-This ensures that, even after eventual pod restarts or node failures, exactly one replica is always active while the remaining ones run on standby.
-
-The supported components in high availability are:
-
-- ***liqo-gateway***: ensures no cross-cluster connectivity downtime. The number of replicas is configurable through the Helm value `gateway.replicas`
-- ***liqo-controller-manager***: ensures the Liqo control plane logic is always enforced. The number of replicas is configurable through the Helm value `controllerManager.replicas`
-
-Look at the [install customization options section](InstallCustomization) for further details on how to configure high availability during Liqo installation.
diff --git a/pkg/liqoctl/install/handler.go b/pkg/liqoctl/install/handler.go
index 16bd3d1291..b31724b23e 100644
--- a/pkg/liqoctl/install/handler.go
+++ b/pkg/liqoctl/install/handler.go
@@ -94,7 +94,6 @@ type CommonOptions struct {
 	ClusterID     liqov1beta1.ClusterID
 	ClusterLabels map[string]string
 
-	EnableHA         bool
 	EnableMetrics    bool
 	DisableTelemetry bool
 
@@ -350,11 +349,6 @@ func (o *Options) isRelease() bool {
 }
 
 func (o *Options) preProviderValues() map[string]interface{} {
-	replicas := 1
-	if o.EnableHA {
-		replicas = 2
-	}
-
 	return map[string]interface{}{
 		"tag": o.Version,
 
@@ -375,10 +369,6 @@ func (o *Options) preProviderValues() map[string]interface{} {
 			},
 		},
 
-		"controllerManager": map[string]interface{}{
-			"replicas": float64(replicas),
-		},
-
 		"ipam": map[string]interface{}{
 			"podCIDR":     o.PodCIDR,
 			"serviceCIDR": o.ServiceCIDR,