From 8a58a4b8a092ca196de1b4bf0a3a6488c7d37a7a Mon Sep 17 00:00:00 2001 From: Salva Corts Date: Fri, 17 Jun 2022 10:55:02 +0200 Subject: [PATCH 1/5] Evenly spread queriers across available nodes --- production/ksonnet/loki/querier.libsonnet | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/production/ksonnet/loki/querier.libsonnet b/production/ksonnet/loki/querier.libsonnet index 0da6fbee01b22..33424670c8e95 100644 --- a/production/ksonnet/loki/querier.libsonnet +++ b/production/ksonnet/loki/querier.libsonnet @@ -26,6 +26,7 @@ local k = import 'ksonnet-util/kausal.libsonnet'; ]) else {}, local deployment = k.apps.v1.deployment, + local topologySpreadConstraints = k.core.v1.topologySpreadConstraint, querier_deployment: if !$._config.stateful_queriers then deployment.new('querier', 3, [$.querier_container]) + @@ -35,7 +36,13 @@ local k = import 'ksonnet-util/kausal.libsonnet'; $._config.overrides_configmap_mount_name, $._config.overrides_configmap_mount_path, ) + - k.util.antiAffinity + + # Evenly spread queriers among available nodes. 
+ deployment.spec.template.spec.withTopologySpreadConstraints( + topologySpreadConstraints.labelSelector.withMatchLabels({ name: 'querier' }) + + topologySpreadConstraints.withTopologyKey('kubernetes.io/hostname') + + topologySpreadConstraints.withWhenUnsatisfiable('ScheduleAnyway') + + topologySpreadConstraints.withMaxSkew(1), + ) + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) else {}, From b6e86f8a51e20d8c098dfa03f3daf1f4b87a0850 Mon Sep 17 00:00:00 2001 From: Salva Corts Date: Fri, 17 Jun 2022 11:31:34 +0200 Subject: [PATCH 2/5] Fix lint issue --- production/ksonnet/loki/querier.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/production/ksonnet/loki/querier.libsonnet b/production/ksonnet/loki/querier.libsonnet index 33424670c8e95..2c1ca68edd8b2 100644 --- a/production/ksonnet/loki/querier.libsonnet +++ b/production/ksonnet/loki/querier.libsonnet @@ -36,7 +36,7 @@ local k = import 'ksonnet-util/kausal.libsonnet'; $._config.overrides_configmap_mount_name, $._config.overrides_configmap_mount_path, ) + - # Evenly spread queriers among available nodes. + // Evenly spread queriers among available nodes. deployment.spec.template.spec.withTopologySpreadConstraints( topologySpreadConstraints.labelSelector.withMatchLabels({ name: 'querier' }) + topologySpreadConstraints.withTopologyKey('kubernetes.io/hostname') + From 96ee6bb7e82e1e8da0202f6cbfdaca0cce4c0ca0 Mon Sep 17 00:00:00 2001 From: Salva Corts Date: Fri, 17 Jun 2022 11:42:47 +0200 Subject: [PATCH 3/5] Add entry to the CHANGELOG and the Upgrade Guide --- CHANGELOG.md | 1 + docs/sources/upgrading/_index.md | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index eabc5fdfe5326..ce40fac514bf8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## Main +* [6415](https://github.com/grafana/loki/pull/6415) **salvacorts**: Evenly spread queriers across Kubernetes nodes.
* [6372](https://github.com/grafana/loki/pull/6372) **splitice**: Add support for numbers in JSON fields. * [6105](https://github.com/grafana/loki/pull/6105) **rutgerke** Export metrics for the Promtail journal target. * [6099](https://github.com/grafana/loki/pull/6099) **cstyan**: Drop lines with malformed JSON in Promtail JSON pipeline stage. diff --git a/docs/sources/upgrading/_index.md b/docs/sources/upgrading/_index.md index 6a18de94d4798..6a2fb28f06db7 100644 --- a/docs/sources/upgrading/_index.md +++ b/docs/sources/upgrading/_index.md @@ -33,6 +33,12 @@ The output is incredibly verbose as it shows the entire internal config struct u ### Loki +#### Evenly spread queriers across kubernetes nodes + +We now evenly spread queriers across the available kubernetes nodes, but allowing more than one querier to be scheduled into the same node. +If you want to keep running up to one querier per node, you will need to revert the changes for `production/ksonnet/loki/querier.libsonnet` +made at [6415](https://github.com/grafana/loki/pull/6415). + #### Implementation of unwrapped `rate` aggregation changed The implementation of the `rate()` aggregation function changed back to the previous implemention prior to [#5013](https://github.com/grafana/loki/pulls/5013). 
From 5628d6e10350451c1599b037252fdc3214f4ec1b Mon Sep 17 00:00:00 2001 From: Salva Corts Date: Mon, 20 Jun 2022 12:16:11 +0200 Subject: [PATCH 4/5] Make topology spread configurable --- docs/sources/upgrading/_index.md | 3 +-- production/ksonnet/loki/config.libsonnet | 7 +++++++ production/ksonnet/loki/querier.libsonnet | 19 +++++++++++-------- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/docs/sources/upgrading/_index.md b/docs/sources/upgrading/_index.md index 6a2fb28f06db7..07e33932b2062 100644 --- a/docs/sources/upgrading/_index.md +++ b/docs/sources/upgrading/_index.md @@ -36,8 +36,7 @@ The output is incredibly verbose as it shows the entire internal config struct u #### Evenly spread queriers across kubernetes nodes We now evenly spread queriers across the available kubernetes nodes, but allowing more than one querier to be scheduled into the same node. -If you want to keep running up to one querier per node, you will need to revert the changes for `production/ksonnet/loki/querier.libsonnet` -made at [6415](https://github.com/grafana/loki/pull/6415). +If you want to keep running up to one querier per node, set `$._config.querier.use_topology_spread` to false. #### Implementation of unwrapped `rate` aggregation changed diff --git a/production/ksonnet/loki/config.libsonnet b/production/ksonnet/loki/config.libsonnet index e2f02285decd6..4ff9f0e092f0a 100644 --- a/production/ksonnet/loki/config.libsonnet +++ b/production/ksonnet/loki/config.libsonnet @@ -47,6 +47,13 @@ // A higher value will lead to a querier trying to process more requests than there are available // cores and will result in scheduling delays. concurrency: 4, + + // If use_topology_spread is true, queriers can run on nodes already running queriers but will be + // spread through the available nodes using a TopologySpreadConstraints with a max skew + // of topology_spread_max_skew. 
+ // If use_topology_spread is false, queriers will not be scheduled on nodes already running queriers. + use_topology_spread: true, + topology_spread_max_skew: 1, }, queryFrontend: { diff --git a/production/ksonnet/loki/querier.libsonnet b/production/ksonnet/loki/querier.libsonnet index 2c1ca68edd8b2..ebf0526c083dc 100644 --- a/production/ksonnet/loki/querier.libsonnet +++ b/production/ksonnet/loki/querier.libsonnet @@ -36,15 +36,18 @@ local k = import 'ksonnet-util/kausal.libsonnet'; $._config.overrides_configmap_mount_name, $._config.overrides_configmap_mount_path, ) + - // Evenly spread queriers among available nodes. - deployment.spec.template.spec.withTopologySpreadConstraints( - topologySpreadConstraints.labelSelector.withMatchLabels({ name: 'querier' }) + - topologySpreadConstraints.withTopologyKey('kubernetes.io/hostname') + - topologySpreadConstraints.withWhenUnsatisfiable('ScheduleAnyway') + - topologySpreadConstraints.withMaxSkew(1), - ) + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + - deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + + if $._config.querier.use_topology_spread then + deployment.spec.template.spec.withTopologySpreadConstraints( + // Evenly spread queriers among available nodes. 
+ topologySpreadConstraints.labelSelector.withMatchLabels({ name: 'querier' }) + + topologySpreadConstraints.withTopologyKey('kubernetes.io/hostname') + + topologySpreadConstraints.withWhenUnsatisfiable('ScheduleAnyway') + + topologySpreadConstraints.withMaxSkew($._config.querier.topology_spread_max_skew), + ) + else + k.util.antiAffinity else {}, // PVC for queriers when running as statefulsets From 1b15fe61806667dc31c0fcf4650a3fc19f35d4c4 Mon Sep 17 00:00:00 2001 From: Salva Corts Date: Mon, 20 Jun 2022 12:34:01 +0200 Subject: [PATCH 5/5] Apply CR feedback --- docs/sources/upgrading/_index.md | 2 +- production/ksonnet/loki/config.libsonnet | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/sources/upgrading/_index.md b/docs/sources/upgrading/_index.md index 07e33932b2062..84d07acf8dfeb 100644 --- a/docs/sources/upgrading/_index.md +++ b/docs/sources/upgrading/_index.md @@ -36,7 +36,7 @@ The output is incredibly verbose as it shows the entire internal config struct u #### Evenly spread queriers across kubernetes nodes We now evenly spread queriers across the available kubernetes nodes, but allowing more than one querier to be scheduled into the same node. -If you want to keep running up to one querier per node, set `$._config.querier.use_topology_spread` to false. +If you want to run at most a single querier per node, set `$._config.querier.use_topology_spread` to false. #### Implementation of unwrapped `rate` aggregation changed diff --git a/production/ksonnet/loki/config.libsonnet b/production/ksonnet/loki/config.libsonnet index 4ff9f0e092f0a..8764465fcdfb5 100644 --- a/production/ksonnet/loki/config.libsonnet +++ b/production/ksonnet/loki/config.libsonnet @@ -51,6 +51,7 @@ // If use_topology_spread is true, queriers can run on nodes already running queriers but will be // spread through the available nodes using a TopologySpreadConstraints with a max skew // of topology_spread_max_skew. 
+ // See: https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/ // If use_topology_spread is false, queriers will not be scheduled on nodes already running queriers. use_topology_spread: true, topology_spread_max_skew: 1,