diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f6f2b799cf9..b30d3bcc039a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## Main +* [6415](https://github.com/grafana/loki/pull/6415) **salvacorts**: Evenly spread queriers across Kubernetes nodes. * [6410](https://github.com/grafana/loki/pull/6410) **MichelHollands**: Add support for per tenant delete API access enabling. * [6372](https://github.com/grafana/loki/pull/6372) **splitice**: Add support for numbers in JSON fields. * [6105](https://github.com/grafana/loki/pull/6105) **rutgerke** Export metrics for the Promtail journal target. diff --git a/docs/sources/upgrading/_index.md b/docs/sources/upgrading/_index.md index 6a18de94d479..84d07acf8dfe 100644 --- a/docs/sources/upgrading/_index.md +++ b/docs/sources/upgrading/_index.md @@ -33,6 +33,11 @@ The output is incredibly verbose as it shows the entire internal config struct u ### Loki +#### Evenly spread queriers across Kubernetes nodes + +Queriers are now evenly spread across the available Kubernetes nodes, while still allowing more than one querier to be scheduled onto the same node. +If you want to run at most a single querier per node, set `$._config.querier.use_topology_spread` to `false`. + #### Implementation of unwrapped `rate` aggregation changed The implementation of the `rate()` aggregation function changed back to the previous implemention prior to [#5013](https://github.com/grafana/loki/pulls/5013). diff --git a/production/ksonnet/loki/config.libsonnet b/production/ksonnet/loki/config.libsonnet index e2f02285decd..8764465fcdfb 100644 --- a/production/ksonnet/loki/config.libsonnet +++ b/production/ksonnet/loki/config.libsonnet @@ -47,6 +47,14 @@ // A higher value will lead to a querier trying to process more requests than there are available // cores and will result in scheduling delays. 
concurrency: 4, + + // If use_topology_spread is true, queriers can run on nodes already running queriers but will be + // spread across the available nodes using a TopologySpreadConstraint with a max skew + // of topology_spread_max_skew. + // See: https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/ + // If use_topology_spread is false, queriers will not be scheduled on nodes already running queriers. + use_topology_spread: true, + topology_spread_max_skew: 1, }, queryFrontend: { diff --git a/production/ksonnet/loki/querier.libsonnet b/production/ksonnet/loki/querier.libsonnet index 0da6fbee01b2..ebf0526c083d 100644 --- a/production/ksonnet/loki/querier.libsonnet +++ b/production/ksonnet/loki/querier.libsonnet @@ -26,6 +26,7 @@ local k = import 'ksonnet-util/kausal.libsonnet'; ]) else {}, local deployment = k.apps.v1.deployment, + local topologySpreadConstraints = k.core.v1.topologySpreadConstraint, querier_deployment: if !$._config.stateful_queriers then deployment.new('querier', 3, [$.querier_container]) + @@ -35,9 +36,18 @@ local k = import 'ksonnet-util/kausal.libsonnet'; $._config.overrides_configmap_mount_name, $._config.overrides_configmap_mount_path, ) + - k.util.antiAffinity + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + - deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + + if $._config.querier.use_topology_spread then + deployment.spec.template.spec.withTopologySpreadConstraints( + // Evenly spread queriers among available nodes. 
+ topologySpreadConstraints.labelSelector.withMatchLabels({ name: 'querier' }) + + topologySpreadConstraints.withTopologyKey('kubernetes.io/hostname') + + topologySpreadConstraints.withWhenUnsatisfiable('ScheduleAnyway') + + topologySpreadConstraints.withMaxSkew($._config.querier.topology_spread_max_skew), + ) + else + k.util.antiAffinity else {}, // PVC for queriers when running as statefulsets