Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

auditd: Fix kernel deadlock after ENOBUFS #26032

Merged
merged 2 commits into from
Jun 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.next.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d
- system/socket: Fixed start failure when run under config reloader. {issue}20851[20851] {pull}21693[21693]
- system/socket: Having some CPUs unavailable to Auditbeat could cause startup errors or event loss. {pull}22827[22827]
- Note incompatibility of system/socket on ARM. {pull}23381[23381]
- auditd: Fix kernel deadlock when netlink congestion causes "no buffer space available" errors. {issue}26031[26031] {pull}26032[26032]

*Filebeat*

Expand Down
50 changes: 45 additions & 5 deletions auditbeat/module/auditd/audit_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ const (

lostEventsUpdateInterval = time.Second * 15
maxDefaultStreamBufferConsumers = 4

setPIDMaxRetries = 5
)

type backpressureStrategy uint8
Expand Down Expand Up @@ -137,10 +139,32 @@ func newAuditClient(c *Config, log *logp.Logger) (*libaudit.AuditClient, error)
return libaudit.NewAuditClient(nil)
}

func closeAuditClient(client *libaudit.AuditClient) error {
discard := func(bytes []byte) ([]syscall.NetlinkMessage, error) {
return nil, nil
}
// Drain the netlink channel in parallel to Close() to prevent a deadlock.
// This goroutine will terminate once receive from netlink errors (EBADF,
// EBADFD, or any other error). This happens because the fd is closed.
go func() {
for {
_, err := client.Netlink.Receive(true, discard)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the only reader at this point, correct? Do we have to worry about any sort of data races in the case of a successfully started client that also has a read loop, or we don't care because we're shutting down anyway?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is the main reader from auditd that may be reading at the same time. In my tests, there's no issue of having both readers in parallel

switch err {
case nil, syscall.EINTR:
case syscall.EAGAIN:
time.Sleep(50 * time.Millisecond)
default:
return
}
}
}()
return client.Close()
}

// Run initializes the audit client and receives audit messages from the
// kernel until the reporter's done channel is closed.
func (ms *MetricSet) Run(reporter mb.PushReporterV2) {
defer ms.client.Close()
defer closeAuditClient(ms.client)

if err := ms.addRules(reporter); err != nil {
reporter.Error(err)
Expand All @@ -164,7 +188,7 @@ func (ms *MetricSet) Run(reporter mb.PushReporterV2) {
go func() {
defer func() { // Close the most recently allocated "client" instance.
if client != nil {
client.Close()
closeAuditClient(client)
}
}()
timer := time.NewTicker(lostEventsUpdateInterval)
Expand All @@ -178,7 +202,7 @@ func (ms *MetricSet) Run(reporter mb.PushReporterV2) {
ms.updateKernelLostMetric(status.Lost)
} else {
ms.log.Error("get status request failed:", err)
if err = client.Close(); err != nil {
if err = closeAuditClient(client); err != nil {
ms.log.Errorw("Error closing audit monitoring client", "error", err)
}
client, err = libaudit.NewAuditClient(nil)
Expand Down Expand Up @@ -233,7 +257,7 @@ func (ms *MetricSet) addRules(reporter mb.PushReporterV2) error {
if err != nil {
return errors.Wrap(err, "failed to create audit client for adding rules")
}
defer client.Close()
defer closeAuditClient(client)

// Don't attempt to change configuration if audit rules are locked (enabled == 2).
// Will result in EPERM.
Expand Down Expand Up @@ -350,10 +374,12 @@ func (ms *MetricSet) initClient() error {
return errors.Wrap(err, "failed to enable auditing in the kernel")
}
}

if err := ms.client.WaitForPendingACKs(); err != nil {
return errors.Wrap(err, "failed to wait for ACKs")
}
if err := ms.client.SetPID(libaudit.WaitForReply); err != nil {

if err := ms.setPID(setPIDMaxRetries); err != nil {
if errno, ok := err.(syscall.Errno); ok && errno == syscall.EEXIST && status.PID != 0 {
return fmt.Errorf("failed to set audit PID. An audit process is already running (PID %d)", status.PID)
}
Expand All @@ -362,6 +388,20 @@ func (ms *MetricSet) initClient() error {
return nil
}

func (ms *MetricSet) setPID(retries int) (err error) {
if err = ms.client.SetPID(libaudit.WaitForReply); err == nil || errors.Cause(err) != syscall.ENOBUFS || retries == 0 {
return err
}
// At this point the netlink channel is congested (ENOBUFS).
// Drain and close the client, then retry with a new client.
closeAuditClient(ms.client)
adriansr marked this conversation as resolved.
Show resolved Hide resolved
if ms.client, err = newAuditClient(&ms.config, ms.log); err != nil {
return errors.Wrapf(err, "failed to recover from ENOBUFS")
}
ms.log.Info("Recovering from ENOBUFS ...")
return ms.setPID(retries - 1)
}

func (ms *MetricSet) updateKernelLostMetric(lost uint32) {
if !ms.kernelLost.enabled {
return
Expand Down