Skip to content

Commit

Permalink
auditd: Fix kernel deadlock after ENOBUFS (elastic#26032)
Browse files Browse the repository at this point in the history
This fixes a deadlock when the netlink channel is congested
(initialization fails with "no buffer space available" / errno=ENOBUFS).

Closes elastic#26031
  • Loading branch information
adriansr authored Jun 7, 2021
1 parent 9766ad3 commit 551baaa
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.next.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d
- system/socket: Fixed start failure when run under config reloader. {issue}20851[20851] {pull}21693[21693]
- system/socket: Having some CPUs unavailable to Auditbeat could cause startup errors or event loss. {pull}22827[22827]
- Note incompatibility of system/socket on ARM. {pull}23381[23381]
- auditd: Fix kernel deadlock when netlink congestion causes "no buffer space available" errors. {issue}26031[26031] {pull}26032[26032]

*Filebeat*

Expand Down
50 changes: 45 additions & 5 deletions auditbeat/module/auditd/audit_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ const (

lostEventsUpdateInterval = time.Second * 15
maxDefaultStreamBufferConsumers = 4

setPIDMaxRetries = 5
)

type backpressureStrategy uint8
Expand Down Expand Up @@ -137,10 +139,32 @@ func newAuditClient(c *Config, log *logp.Logger) (*libaudit.AuditClient, error)
return libaudit.NewAuditClient(nil)
}

func closeAuditClient(client *libaudit.AuditClient) error {
discard := func(bytes []byte) ([]syscall.NetlinkMessage, error) {
return nil, nil
}
// Drain the netlink channel in parallel to Close() to prevent a deadlock.
// This goroutine will terminate once receive from netlink errors (EBADF,
// EBADFD, or any other error). This happens because the fd is closed.
go func() {
for {
_, err := client.Netlink.Receive(true, discard)
switch err {
case nil, syscall.EINTR:
case syscall.EAGAIN:
time.Sleep(50 * time.Millisecond)
default:
return
}
}
}()
return client.Close()
}

// Run initializes the audit client and receives audit messages from the
// kernel until the reporter's done channel is closed.
func (ms *MetricSet) Run(reporter mb.PushReporterV2) {
defer ms.client.Close()
defer closeAuditClient(ms.client)

if err := ms.addRules(reporter); err != nil {
reporter.Error(err)
Expand All @@ -164,7 +188,7 @@ func (ms *MetricSet) Run(reporter mb.PushReporterV2) {
go func() {
defer func() { // Close the most recently allocated "client" instance.
if client != nil {
client.Close()
closeAuditClient(client)
}
}()
timer := time.NewTicker(lostEventsUpdateInterval)
Expand All @@ -178,7 +202,7 @@ func (ms *MetricSet) Run(reporter mb.PushReporterV2) {
ms.updateKernelLostMetric(status.Lost)
} else {
ms.log.Error("get status request failed:", err)
if err = client.Close(); err != nil {
if err = closeAuditClient(client); err != nil {
ms.log.Errorw("Error closing audit monitoring client", "error", err)
}
client, err = libaudit.NewAuditClient(nil)
Expand Down Expand Up @@ -233,7 +257,7 @@ func (ms *MetricSet) addRules(reporter mb.PushReporterV2) error {
if err != nil {
return errors.Wrap(err, "failed to create audit client for adding rules")
}
defer client.Close()
defer closeAuditClient(client)

// Don't attempt to change configuration if audit rules are locked (enabled == 2).
// Will result in EPERM.
Expand Down Expand Up @@ -350,10 +374,12 @@ func (ms *MetricSet) initClient() error {
return errors.Wrap(err, "failed to enable auditing in the kernel")
}
}

if err := ms.client.WaitForPendingACKs(); err != nil {
return errors.Wrap(err, "failed to wait for ACKs")
}
if err := ms.client.SetPID(libaudit.WaitForReply); err != nil {

if err := ms.setPID(setPIDMaxRetries); err != nil {
if errno, ok := err.(syscall.Errno); ok && errno == syscall.EEXIST && status.PID != 0 {
return fmt.Errorf("failed to set audit PID. An audit process is already running (PID %d)", status.PID)
}
Expand All @@ -362,6 +388,20 @@ func (ms *MetricSet) initClient() error {
return nil
}

func (ms *MetricSet) setPID(retries int) (err error) {
if err = ms.client.SetPID(libaudit.WaitForReply); err == nil || errors.Cause(err) != syscall.ENOBUFS || retries == 0 {
return err
}
// At this point the netlink channel is congested (ENOBUFS).
// Drain and close the client, then retry with a new client.
closeAuditClient(ms.client)
if ms.client, err = newAuditClient(&ms.config, ms.log); err != nil {
return errors.Wrapf(err, "failed to recover from ENOBUFS")
}
ms.log.Info("Recovering from ENOBUFS ...")
return ms.setPID(retries - 1)
}

func (ms *MetricSet) updateKernelLostMetric(lost uint32) {
if !ms.kernelLost.enabled {
return
Expand Down

0 comments on commit 551baaa

Please sign in to comment.