From 21f2c283324b79accb7b31939b3d0f2ff5ab8f6b Mon Sep 17 00:00:00 2001 From: sebgl Date: Mon, 14 Oct 2019 13:08:00 +0200 Subject: [PATCH] Reconcile certificates every 10 hour to avoid a timer memory leak We have an issue where the underlying timer used by client-go worker queue implementation stays in memory until it expires. Since one gets created at every reconciliation attempt, we end up with a big bunch of timers in memory that will expire in 365 days by default. To mitigate the memory leak, let's wait for no more than 10 hours to reconcile. For more details, see https://github.com/elastic/cloud-on-k8s/issues/1984. --- .../apmserver/certificates/reconcile.go | 2 +- .../common/certificates/expiration.go | 19 +++++++++- .../common/certificates/expiration_test.go | 37 ++++++++++++++++++- .../certificates/ca_reconcile.go | 4 +- .../kibana/certificates/reconcile.go | 2 +- 5 files changed, 56 insertions(+), 8 deletions(-) diff --git a/pkg/controller/apmserver/certificates/reconcile.go b/pkg/controller/apmserver/certificates/reconcile.go index 60fe314ba4..b6d973b315 100644 --- a/pkg/controller/apmserver/certificates/reconcile.go +++ b/pkg/controller/apmserver/certificates/reconcile.go @@ -48,7 +48,7 @@ func Reconcile( // handle CA expiry via requeue results.WithResult(reconcile.Result{ - RequeueAfter: certificates.ShouldRotateIn(time.Now(), httpCa.Cert.NotAfter, rotation.RotateBefore), + RequeueAfter: certificates.ShouldReconcileIn(time.Now(), httpCa.Cert.NotAfter, rotation.RotateBefore), }) // discover and maybe reconcile for the http certificates to use diff --git a/pkg/controller/common/certificates/expiration.go b/pkg/controller/common/certificates/expiration.go index 087852bcab..24ec5bee5f 100644 --- a/pkg/controller/common/certificates/expiration.go +++ b/pkg/controller/common/certificates/expiration.go @@ -12,6 +12,8 @@ const ( // DefaultRotateBefore defines how long before expiration a certificate // should be re-issued DefaultRotateBefore = 24 * time.Hour + // MaxReconciliationPeriod defines the maximum period of time between 2 certificates rotation. + MaxReconciliationPeriod = 10 * time.Hour ) // RotationParams defines validity and a safety margin for certificate rotation. @@ -22,9 +24,9 @@ type RotationParams struct { RotateBefore time.Duration } -// ShouldRotateIn computes the duration after which a certificate rotation should be scheduled +// shouldRotateIn computes the duration after which a certificate rotation should be scheduled // in order for the CA cert to be rotated before it expires. -func ShouldRotateIn(now time.Time, certExpiration time.Time, caCertRotateBefore time.Duration) time.Duration { +func shouldRotateIn(now time.Time, certExpiration time.Time, caCertRotateBefore time.Duration) time.Duration { // make sure we are past the safety margin when rotating, by making it a little bit shorter safetyMargin := caCertRotateBefore - 1*time.Second requeueTime := certExpiration.Add(-safetyMargin) @@ -35,3 +37,16 @@ func ShouldRotateIn(now time.Time, certExpiration time.Time, caCertRotateBefore } return requeueIn } + +// ShouldReconcileIn returns the duration after which a reconciliation should be done +// to make sure certificates do not expire. +func ShouldReconcileIn(now time.Time, certExpiration time.Time, caCertRotateBefore time.Duration) time.Duration { + rotateIn := shouldRotateIn(now, certExpiration, caCertRotateBefore) + if rotateIn > MaxReconciliationPeriod { + // We don't want to wait for rotateIn to be reached, because of an underlying leaky timer issue. + // See https://github.com/elastic/cloud-on-k8s/issues/1984. + // TODO: remove once https://github.com/kubernetes/client-go/issues/701 is fixed. + return MaxReconciliationPeriod + } + return rotateIn +} diff --git a/pkg/controller/common/certificates/expiration_test.go b/pkg/controller/common/certificates/expiration_test.go index e92c73376f..f335adcfc3 100644 --- a/pkg/controller/common/certificates/expiration_test.go +++ b/pkg/controller/common/certificates/expiration_test.go @@ -9,7 +9,7 @@ import ( "time" ) -func TestShouldRotateIn(t *testing.T) { +func Test_shouldRotateIn(t *testing.T) { now := time.Now() tests := []struct { name string @@ -49,9 +49,42 @@ func TestShouldRotateIn(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := ShouldRotateIn(tt.now, tt.certExpiration, tt.caCertRotateBefore); got != tt.want { + if got := shouldRotateIn(tt.now, tt.certExpiration, tt.caCertRotateBefore); got != tt.want { t.Errorf("shouldRequeueIn() = %v, want %v", got, tt.want) } }) } } + +func TestShouldReconcileIn(t *testing.T) { + now := time.Now() + tests := []struct { + name string + now time.Time + certExpiration time.Time + caCertRotateBefore time.Duration + want time.Duration + }{ + { + name: "rotation scheduled in less than 10 hours: requeue at that time", + now: now, + certExpiration: now.Add(10 * time.Hour), + caCertRotateBefore: 1 * time.Hour, + want: 9*time.Hour + 1*time.Second, + }, + { + name: "rotation scheduled in more than 10 hours: requeue in 10 hours", + now: now, + certExpiration: now.Add(30 * time.Hour), + caCertRotateBefore: 1 * time.Hour, + want: 10 * time.Hour, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := ShouldReconcileIn(tt.now, tt.certExpiration, tt.caCertRotateBefore); got != tt.want { + t.Errorf("ShouldReconcileIn() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/controller/elasticsearch/certificates/ca_reconcile.go b/pkg/controller/elasticsearch/certificates/ca_reconcile.go index f7671ec54f..a0be8567de 100644 --- a/pkg/controller/elasticsearch/certificates/ca_reconcile.go +++ b/pkg/controller/elasticsearch/certificates/ca_reconcile.go @@ -59,7 +59,7 @@ func Reconcile( // make sure to requeue before the CA cert expires results.WithResult(reconcile.Result{ - RequeueAfter: certificates.ShouldRotateIn(time.Now(), httpCA.Cert.NotAfter, caRotation.RotateBefore), + RequeueAfter: certificates.ShouldReconcileIn(time.Now(), httpCA.Cert.NotAfter, caRotation.RotateBefore), }) // discover and maybe reconcile for the http certificates to use @@ -96,7 +96,7 @@ func Reconcile( } // make sure to requeue before the CA cert expires results.WithResult(reconcile.Result{ - RequeueAfter: certificates.ShouldRotateIn(time.Now(), transportCA.Cert.NotAfter, caRotation.RotateBefore), + RequeueAfter: certificates.ShouldReconcileIn(time.Now(), transportCA.Cert.NotAfter, caRotation.RotateBefore), }) // reconcile transport public certs secret: diff --git a/pkg/controller/kibana/certificates/reconcile.go b/pkg/controller/kibana/certificates/reconcile.go index 95c9ef746d..89bfcdda13 100644 --- a/pkg/controller/kibana/certificates/reconcile.go +++ b/pkg/controller/kibana/certificates/reconcile.go @@ -48,7 +48,7 @@ func Reconcile( // handle CA expiry via requeue results.WithResult(reconcile.Result{ - RequeueAfter: certificates.ShouldRotateIn(time.Now(), httpCa.Cert.NotAfter, rotation.RotateBefore), + RequeueAfter: certificates.ShouldReconcileIn(time.Now(), httpCa.Cert.NotAfter, rotation.RotateBefore), }) // discover and maybe reconcile for the http certificates to use