Skip to content

Commit

Permalink
Add MaxSurge and MaxUnavailable strategy to all Loki k8s workloads.
Browse files Browse the repository at this point in the history
This fixes a couple of issues.
1. By default these configs are 25% in k8s, meaning that during a rollout 25% of the pods are allowed to shut down immediately.
2. Due to (1), during the graceful shutdown process, 25% of all the pods access Consul at once to `unregister()` from the shared key-value store.

(2) drives up the CAS rate on the underlying KV store (leading to many retries and failures), so pods sometimes fail to unregister, leaving the ring "unhealthy".

This PR also makes these configs consistent across all k8s workloads.

More details: grafana/dskit#117
  • Loading branch information
kavirajk committed Jan 24, 2022
1 parent 50ca4d5 commit 421ac2a
Show file tree
Hide file tree
Showing 9 changed files with 33 additions and 11 deletions.
4 changes: 3 additions & 1 deletion production/ksonnet/loki/boltdb_shipper.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@
k.util.configVolumeMount('loki', '/etc/loki/config') +
k.util.configVolumeMount('overrides', '/etc/loki/overrides') +
statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') +
statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) // 10001 is the group ID assigned to Loki in the Dockerfile
statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the group ID assigned to Loki in the Dockerfile
statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) +
statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1)
else {},
}
4 changes: 3 additions & 1 deletion production/ksonnet/loki/distributor.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ local k = import 'ksonnet-util/kausal.libsonnet';
$._config.overrides_configmap_mount_name,
$._config.overrides_configmap_mount_path,
) +
k.util.antiAffinity,
k.util.antiAffinity +
deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) +
deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1),

distributor_service:
k.util.serviceFor($.distributor_deployment),
Expand Down
4 changes: 3 additions & 1 deletion production/ksonnet/loki/gateway.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,9 @@ local k = import 'ksonnet-util/kausal.libsonnet';
}) +
k.util.configVolumeMount('gateway-config', '/etc/nginx') +
k.util.secretVolumeMount('gateway-secret', '/etc/nginx/secrets', defaultMode=420) +
k.util.antiAffinity,
k.util.antiAffinity +
deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) +
deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1),

gateway_service:
k.util.serviceFor($.gateway_deployment),
Expand Down
4 changes: 3 additions & 1 deletion production/ksonnet/loki/index-gateway.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@
$.config_hash_mixin +
k.util.configVolumeMount('loki', '/etc/loki/config') +
statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') +
statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) // 10001 is the group ID assigned to Loki in the Dockerfile
statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the group ID assigned to Loki in the Dockerfile
statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) +
statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1)
else {},

index_gateway_service: if $._config.use_index_gateway then
Expand Down
4 changes: 3 additions & 1 deletion production/ksonnet/loki/ingester.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ local k = import 'ksonnet-util/kausal.libsonnet';
k.util.antiAffinity +
statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') +
statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the group ID assigned to Loki in the Dockerfile
statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800)
statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) +
statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) +
statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1)
else {},

ingester_service:
Expand Down
8 changes: 6 additions & 2 deletions production/ksonnet/loki/querier.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ local k = import 'ksonnet-util/kausal.libsonnet';
$._config.overrides_configmap_mount_name,
$._config.overrides_configmap_mount_path,
) +
k.util.antiAffinity
k.util.antiAffinity +
deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) +
deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1)
else {},

// PVC for queriers when running as statefulsets
Expand All @@ -57,7 +59,9 @@ local k = import 'ksonnet-util/kausal.libsonnet';
) +
k.util.antiAffinity +
statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') +
statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) // 10001 is the group ID assigned to Loki in the Dockerfile
statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the group ID assigned to Loki in the Dockerfile
statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) +
statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1)
else {},

querier_service:
Expand Down
4 changes: 3 additions & 1 deletion production/ksonnet/loki/query-frontend.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ local k = import 'ksonnet-util/kausal.libsonnet';
$._config.overrides_configmap_mount_name,
$._config.overrides_configmap_mount_path,
) +
k.util.antiAffinity,
k.util.antiAffinity +
deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) +
deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1),

local service = k.core.v1.service,

Expand Down
4 changes: 3 additions & 1 deletion production/ksonnet/loki/query-scheduler.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,9 @@ local k = import 'ksonnet-util/kausal.libsonnet';
$._config.overrides_configmap_mount_name,
$._config.overrides_configmap_mount_path,
) +
k.util.antiAffinity
k.util.antiAffinity +
deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) +
deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1)
else {},

local service = k.core.v1.service,
Expand Down
8 changes: 6 additions & 2 deletions production/ksonnet/loki/ruler.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ local k = import 'ksonnet-util/kausal.libsonnet';
$._config.overrides_configmap_mount_name,
$._config.overrides_configmap_mount_path,
) +
k.util.antiAffinity
k.util.antiAffinity +
deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) +
deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1)
else {},

ruler_service: if !$._config.ruler_enabled
Expand Down Expand Up @@ -75,6 +77,8 @@ local k = import 'ksonnet-util/kausal.libsonnet';
) +
k.util.antiAffinity +
statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') +
statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) // 10001 is the group ID assigned to Loki in the Dockerfile
statefulSet.mixin.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the group ID assigned to Loki in the Dockerfile
statefulSet.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) +
statefulSet.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1)
else {},
}

0 comments on commit 421ac2a

Please sign in to comment.