Skip to content

Commit

Permalink
Restart Beat when certificates or key files are modified (#34416)
Browse files Browse the repository at this point in the history
This commit enables any Beat to reexec when an output CA certificate file is modified. The main use case we're covering here is enabling "CA certificates reload" when running Metricbeat on ECK to monitor Elasticsearch. At the moment this feature is only supported on Linux and macOS.

libbeat/management/management.go is also refactored and some unused bits of code are removed.
  • Loading branch information
belimawr authored Feb 28, 2023
1 parent 5c9e055 commit 6bf9c83
Show file tree
Hide file tree
Showing 29 changed files with 644 additions and 29 deletions.
25 changes: 25 additions & 0 deletions auditbeat/auditbeat.reference.yml
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,15 @@ output.elasticsearch:
# only one in the list. Then the normal SSL validation happens.
#ssl.ca_trusted_fingerprint: ""


# Enables restarting auditbeat if any file listed by `key`,
# `certificate`, or `certificate_authorities` is modified.
# This feature IS NOT supported on Windows.
#ssl.restart_on_cert_change.enabled: false

# Period to scan for changes on CA certificate files
#ssl.restart_on_cert_change.period: 1m

# Enable Kerberos support. Kerberos is automatically enabled if any Kerberos setting is set.
#kerberos.enabled: true

Expand Down Expand Up @@ -720,6 +729,14 @@ output.elasticsearch:
# only one in the list. Then the normal SSL validation happens.
#ssl.ca_trusted_fingerprint: ""

# Enables restarting auditbeat if any file listed by `key`,
# `certificate`, or `certificate_authorities` is modified.
# This feature IS NOT supported on Windows.
#ssl.restart_on_cert_change.enabled: false

# Period to scan for changes on CA certificate files
#ssl.restart_on_cert_change.period: 1m

# The number of times to retry publishing an event after a publishing failure.
# After the specified number of retries, the events are typically dropped.
# Some Beats, such as Filebeat and Winlogbeat, ignore the max_retries setting
Expand Down Expand Up @@ -925,6 +942,14 @@ output.elasticsearch:
# only one in the list. Then the normal SSL validation happens.
#ssl.ca_trusted_fingerprint: ""

# Enables restarting auditbeat if any file listed by `key`,
# `certificate`, or `certificate_authorities` is modified.
# This feature IS NOT supported on Windows.
#ssl.restart_on_cert_change.enabled: false

# Period to scan for changes on CA certificate files
#ssl.restart_on_cert_change.period: 1m

# Enable Kerberos support. Kerberos is automatically enabled if any Kerberos setting is set.
#kerberos.enabled: true

Expand Down
12 changes: 11 additions & 1 deletion filebeat/beater/filebeat.go
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,17 @@ func (fb *Filebeat) Run(b *beat.Beat) error {
}

// Stop the manager and stop the connection to any dependent services.
b.Manager.Stop()
// The Manager started to have a working implementation when
// https://github.com/elastic/beats/pull/34416 was merged.
// This is intended to enable TLS certificates reload on a long
// running Beat.
//
// However calling b.Manager.Stop() here messes up the behavior of the
// --once flag because it makes Filebeat exit early.
// So if --once is passed, we don't call b.Manager.Stop().
if !*once {
b.Manager.Stop()
}

return nil
}
Expand Down
25 changes: 25 additions & 0 deletions filebeat/filebeat.reference.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1664,6 +1664,15 @@ output.elasticsearch:
# only one in the list. Then the normal SSL validation happens.
#ssl.ca_trusted_fingerprint: ""


# Enables restarting filebeat if any file listed by `key`,
# `certificate`, or `certificate_authorities` is modified.
# This feature IS NOT supported on Windows.
#ssl.restart_on_cert_change.enabled: false

# Period to scan for changes on CA certificate files
#ssl.restart_on_cert_change.period: 1m

# Enable Kerberos support. Kerberos is automatically enabled if any Kerberos setting is set.
#kerberos.enabled: true

Expand Down Expand Up @@ -1803,6 +1812,14 @@ output.elasticsearch:
# only one in the list. Then the normal SSL validation happens.
#ssl.ca_trusted_fingerprint: ""

# Enables restarting filebeat if any file listed by `key`,
# `certificate`, or `certificate_authorities` is modified.
# This feature IS NOT supported on Windows.
#ssl.restart_on_cert_change.enabled: false

# Period to scan for changes on CA certificate files
#ssl.restart_on_cert_change.period: 1m

# The number of times to retry publishing an event after a publishing failure.
# After the specified number of retries, the events are typically dropped.
# Some Beats, such as Filebeat and Winlogbeat, ignore the max_retries setting
Expand Down Expand Up @@ -2008,6 +2025,14 @@ output.elasticsearch:
# only one in the list. Then the normal SSL validation happens.
#ssl.ca_trusted_fingerprint: ""

# Enables restarting filebeat if any file listed by `key`,
# `certificate`, or `certificate_authorities` is modified.
# This feature IS NOT supported on Windows.
#ssl.restart_on_cert_change.enabled: false

# Period to scan for changes on CA certificate files
#ssl.restart_on_cert_change.period: 1m

# Enable Kerberos support. Kerberos is automatically enabled if any Kerberos setting is set.
#kerberos.enabled: true

Expand Down
25 changes: 25 additions & 0 deletions heartbeat/heartbeat.reference.yml
Original file line number Diff line number Diff line change
Expand Up @@ -673,6 +673,15 @@ output.elasticsearch:
# only one in the list. Then the normal SSL validation happens.
#ssl.ca_trusted_fingerprint: ""


# Enables restarting heartbeat if any file listed by `key`,
# `certificate`, or `certificate_authorities` is modified.
# This feature IS NOT supported on Windows.
#ssl.restart_on_cert_change.enabled: false

# Period to scan for changes on CA certificate files
#ssl.restart_on_cert_change.period: 1m

# Enable Kerberos support. Kerberos is automatically enabled if any Kerberos setting is set.
#kerberos.enabled: true

Expand Down Expand Up @@ -812,6 +821,14 @@ output.elasticsearch:
# only one in the list. Then the normal SSL validation happens.
#ssl.ca_trusted_fingerprint: ""

# Enables restarting heartbeat if any file listed by `key`,
# `certificate`, or `certificate_authorities` is modified.
# This feature IS NOT supported on Windows.
#ssl.restart_on_cert_change.enabled: false

# Period to scan for changes on CA certificate files
#ssl.restart_on_cert_change.period: 1m

# The number of times to retry publishing an event after a publishing failure.
# After the specified number of retries, the events are typically dropped.
# Some Beats, such as Filebeat and Winlogbeat, ignore the max_retries setting
Expand Down Expand Up @@ -1017,6 +1034,14 @@ output.elasticsearch:
# only one in the list. Then the normal SSL validation happens.
#ssl.ca_trusted_fingerprint: ""

# Enables restarting heartbeat if any file listed by `key`,
# `certificate`, or `certificate_authorities` is modified.
# This feature IS NOT supported on Windows.
#ssl.restart_on_cert_change.enabled: false

# Period to scan for changes on CA certificate files
#ssl.restart_on_cert_change.period: 1m

# Enable Kerberos support. Kerberos is automatically enabled if any Kerberos setting is set.
#kerberos.enabled: true

Expand Down
9 changes: 9 additions & 0 deletions libbeat/_meta/config/output-elasticsearch.reference.yml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,15 @@ output.elasticsearch:
#allow_older_versions: false

{{include "ssl.reference.yml.tmpl" . | indent 2 }}

# Enables restarting {{.BeatName}} if any file listed by `key`,
# `certificate`, or `certificate_authorities` is modified.
# This feature IS NOT supported on Windows.
#ssl.restart_on_cert_change.enabled: false

# Period to scan for changes on the files listed by `key`,
# `certificate`, or `certificate_authorities`
#ssl.restart_on_cert_change.period: 1m

# Enable Kerberos support. Kerberos is automatically enabled if any Kerberos setting is set.
#kerberos.enabled: true

Expand Down
8 changes: 8 additions & 0 deletions libbeat/_meta/config/output-kafka.reference.yml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,14 @@
#client_id: beats

{{include "ssl.reference.yml.tmpl" . | indent 2 }}
# Enables restarting {{.BeatName}} if any file listed by `key`,
# `certificate`, or `certificate_authorities` is modified.
# This feature IS NOT supported on Windows.
#ssl.restart_on_cert_change.enabled: false

# Period to scan for changes on the files listed by `key`,
# `certificate`, or `certificate_authorities`
#ssl.restart_on_cert_change.period: 1m

# Enable Kerberos support. Kerberos is automatically enabled if any Kerberos setting is set.
#kerberos.enabled: true

Expand Down
8 changes: 8 additions & 0 deletions libbeat/_meta/config/output-logstash.reference.yml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,14 @@
#proxy_use_local_resolver: false

{{include "ssl.reference.yml.tmpl" . | indent 2 }}
# Enables restarting {{.BeatName}} if any file listed by `key`,
# `certificate`, or `certificate_authorities` is modified.
# This feature IS NOT supported on Windows.
#ssl.restart_on_cert_change.enabled: false

# Period to scan for changes on the files listed by `key`,
# `certificate`, or `certificate_authorities`
#ssl.restart_on_cert_change.period: 1m

# The number of times to retry publishing an event after a publishing failure.
# After the specified number of retries, the events are typically dropped.
# Some Beats, such as Filebeat and Winlogbeat, ignore the max_retries setting
Expand Down
140 changes: 139 additions & 1 deletion libbeat/cmd/instance/beat.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ import (
"github.com/elastic/beats/v7/libbeat/version"
"github.com/elastic/elastic-agent-libs/config"
"github.com/elastic/elastic-agent-libs/file"
"github.com/elastic/elastic-agent-libs/filewatcher"
"github.com/elastic/elastic-agent-libs/keystore"
kbn "github.com/elastic/elastic-agent-libs/kibana"
"github.com/elastic/elastic-agent-libs/logp"
Expand All @@ -76,6 +77,7 @@ import (
"github.com/elastic/elastic-agent-libs/monitoring/report/buffer"
"github.com/elastic/elastic-agent-libs/paths"
svc "github.com/elastic/elastic-agent-libs/service"
"github.com/elastic/elastic-agent-libs/transport/tlscommon"
libversion "github.com/elastic/elastic-agent-libs/version"
"github.com/elastic/elastic-agent-system-metrics/metric/system/host"
metricreport "github.com/elastic/elastic-agent-system-metrics/report"
Expand All @@ -96,6 +98,9 @@ type Beat struct {
processing processing.Supporter

InputQueueSize int // Size of the producer queue used by most queues.

// shouldReexec is a flag to indicate the Beat should restart
shouldReexec bool
}

type beatConfig struct {
Expand Down Expand Up @@ -139,6 +144,32 @@ type beatConfig struct {
TimestampPrecision *config.C `config:"timestamp"`
}

// certReloadConfig is the shape the output's `ssl` section is unpacked into
// when setting up the certificate watcher: the regular TLS settings plus the
// `restart_on_cert_change` options.
type certReloadConfig struct {
// Embedded TLS settings, read inline from the same config level.
tlscommon.Config `config:",inline" yaml:",inline"`
// Reload carries the `restart_on_cert_change` settings; this file reads
// its Enabled and Period fields.
Reload cfgfile.Reload `config:"restart_on_cert_change" yaml:"restart_on_cert_change"`
}

// Validate checks the `restart_on_cert_change` settings.
//
// It returns an error when:
//   - the scan period is shorter than one second (this is checked whether or
//     not the feature is enabled), or
//   - the feature is enabled while running on Windows, where restarting the
//     Beat on certificate change is not supported.
//
// It returns nil otherwise.
func (c certReloadConfig) Validate() error {
	if c.Reload.Period < time.Second {
		return errors.New("'restart_on_cert_change.period' must be equal or greater than 1s")
	}

	if c.Reload.Enabled && runtime.GOOS == "windows" {
		return errors.New("'restart_on_cert_change' is not supported on Windows")
	}

	return nil
}

// defaultCertReloadConfig returns the settings used before the user's `ssl`
// section is unpacked on top of them: restart-on-change is off and the scan
// period is one minute. The embedded TLS settings are left at their zero value.
func defaultCertReloadConfig() certReloadConfig {
	var defaults certReloadConfig
	defaults.Reload.Enabled = false
	defaults.Reload.Period = time.Minute
	return defaults
}

var debugf = logp.MakeDebug("beat")

func init() {
Expand Down Expand Up @@ -488,7 +519,19 @@ func (b *Beat) launch(settings Settings, bt beat.Creator) error {
// Allow the manager to stop a currently running beats out of bound.
b.Manager.SetStopCallback(beater.Stop)

return beater.Run(&b.Beat)
err = beater.Run(&b.Beat)
if b.shouldReexec {
if err := b.reexec(); err != nil {
return fmt.Errorf("could not restart %s: %w", b.Info.Beat, err)
}
}

return err
}

// reexec restarts the Beat by delegating to doReexec, the OS-specific
// implementation, and returns whatever error doReexec reports.
func (b *Beat) reexec() error {
return b.doReexec()
}

// registerMetrics registers metrics with the internal monitoring API. This data
Expand Down Expand Up @@ -980,11 +1023,106 @@ func (b *Beat) makeOutputFactory(
}
}

// reloadOutputOnCertChange starts a background watcher that flags the Beat for
// re-execution and stops the manager as soon as one of the files referenced by
// the output's `ssl` section (`certificate_authorities`, `certificate`, `key`)
// is modified.
//
// cfg is the output configuration; only its `ssl` child is inspected.
//
// It returns nil without starting a watcher when the output has no `ssl`
// section, when `restart_on_cert_change.enabled` is false, or when none of the
// configured entries is a file path (empty values and embedded PEM strings are
// skipped). It returns an error when the `ssl` section cannot be extracted or
// unpacked, or when the scan period is not positive.
func (b *Beat) reloadOutputOnCertChange(cfg config.Namespace) error {
	logger := logp.L().Named("ssl.cert.reloader")
	// Here the output is created and we have access to the Beat struct (with the manager)
	// as a workaround we can unpack the new settings and trigger the reload-watcher from here

	// We get an output config, so we extract the 'SSL' bit from it
	rawTLSCfg, err := cfg.Config().Child("ssl", -1)
	if err != nil {
		var e ucfg.Error
		if errors.As(err, &e) && errors.Is(e.Reason(), ucfg.ErrMissing) {
			// if the output configuration does not contain a `ssl` section
			// do nothing and return no error
			return nil
		}
		return fmt.Errorf("could not extract the 'ssl' section of the output config: %w", err)
	}

	extendedTLSCfg := defaultCertReloadConfig()
	if err := rawTLSCfg.Unpack(&extendedTLSCfg); err != nil {
		return fmt.Errorf("unpacking 'ssl' config: %w", err)
	}

	if !extendedTLSCfg.Reload.Enabled {
		return nil
	}
	logger.Debug("exit on CA certs change enabled")

	// time.NewTicker panics on a non-positive duration, so reject it here
	// instead of inside the goroutine.
	// NOTE(review): certReloadConfig.Validate already rejects periods below
	// 1s; this guard only matters if Unpack does not run Validate — confirm.
	period := extendedTLSCfg.Reload.Period
	if period <= 0 {
		return fmt.Errorf("invalid 'restart_on_cert_change.period': %s", period)
	}

	// Build the candidate list in a fresh slice: appending directly to
	// extendedTLSCfg.CAs could write into that slice's backing array.
	possibleFilesToWatch := make([]string, 0, len(extendedTLSCfg.CAs)+2)
	possibleFilesToWatch = append(possibleFilesToWatch, extendedTLSCfg.CAs...)
	possibleFilesToWatch = append(
		possibleFilesToWatch,
		extendedTLSCfg.Certificate.Certificate,
		extendedTLSCfg.Certificate.Key,
	)

	filesToWatch := make([]string, 0, len(possibleFilesToWatch))
	for _, f := range possibleFilesToWatch {
		if f == "" {
			continue
		}
		if tlscommon.IsPEMString(f) {
			// That's an embedded cert, we're only interested in files
			continue
		}

		logger.Debugf("watching '%s' for changes", f)
		filesToWatch = append(filesToWatch, f)
	}

	// If there are no files to watch, don't do anything.
	if len(filesToWatch) == 0 {
		logger.Debug("no files to watch, filewatcher will not be started")
		return nil
	}

	watcher := filewatcher.New(filesToWatch...)
	// Ignore the first scan as it will always return
	// true for files changed. The output has not been
	// started yet, so even if the files have changed since
	// the Beat started, they don't need to be reloaded
	_, _, _ = watcher.Scan()

	// Watch for file changes while the Beat is alive
	go func() {
		// The goroutine returns once a change is detected, so use a ticker
		// that can be stopped rather than time.Tick.
		ticker := time.NewTicker(period)
		defer ticker.Stop()

		for range ticker.C {
			files, changed, err := watcher.Scan()
			if err != nil {
				// A scan error is only logged; `changed` is still honoured.
				logger.Warnf("could not scan certificate files: %s", err.Error())
			}

			if !changed {
				continue
			}

			logger.Infof(
				"some of the following files have been modified: %v, restarting %s.",
				files, b.Info.Beat)

			// Flag the restart before stopping the manager.
			b.shouldReexec = true
			b.Manager.Stop()

			// we're done, finish the goroutine just for the sake of it
			return
		}
	}()

	return nil
}

// createOutput builds the output group described by cfg.
//
// An unset cfg yields an empty group and no error. Otherwise the certificate
// reloader is set up first (a failure there aborts with a wrapped error), and
// the output is then loaded through outputs.Load.
func (b *Beat) createOutput(stats outputs.Observer, cfg config.Namespace) (outputs.Group, error) {
	var noOutput outputs.Group

	if !cfg.IsSet() {
		return noOutput, nil
	}

	watchErr := b.reloadOutputOnCertChange(cfg)
	if watchErr != nil {
		return noOutput, fmt.Errorf("could not setup output certificates reloader: %w", watchErr)
	}

	outputName := cfg.Name()
	outputCfg := cfg.Config()
	return outputs.Load(b.IdxSupporter, b.Info, stats, outputName, outputCfg)
}

Expand Down
Loading

0 comments on commit 6bf9c83

Please sign in to comment.