Skip to content

Commit

Permalink
[7.12](backport elastic#26032) auditd: Fix kernel deadlock after ENOB…
Browse files Browse the repository at this point in the history
…UFS (elastic#26173)

This fixes a deadlock when the netlink channel is congested
(initialization fails with "no buffer space available" / errno=ENOBUFS).

Closes elastic#26031

(cherry picked from commit 551baaa)
  • Loading branch information
mergify[bot] committed Jun 9, 2021
1 parent 42abba1 commit 8e5336a
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 6 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.next.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d
- system/package: Fix an error that can occur while trying to persist package metadata. {issue}18536[18536] {pull}18887[18887]
- system/socket: Fix dataset using 100% CPU and becoming unresponsive in some scenarios. {pull}19033[19033] {pull}19764[19764]
- system/socket: Fixed tracking of long-running connections. {pull}19033[19033]
- Fix issue with m365_defender, when parsing incidents that has no alerts attached: {pull}25421[25421]
- auditd: Fix kernel deadlock when netlink congestion causes "no buffer space available" errors. {issue}26031[26031] {pull}26032[26032]

*Filebeat*

Expand Down Expand Up @@ -176,6 +176,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d
- Drop aws.vpcflow.pkt_srcaddr and aws.vpcflow.pkt_dstaddr when equal to "-". {pull}22721[22721] {issue}22716[22716]
- Remove space from field `sophos.xg.trans_src_ ip`. {issue}25154[25154] {pull}25250[25250]
- Fix `checkpoint.action_reason` when its a string, not a Long. {issue}25575[25575] {pull}25609[25609]
- Fix issue with m365_defender, when parsing incidents that has no alerts attached: {pull}25421[25421]

*Heartbeat*

Expand Down
50 changes: 45 additions & 5 deletions auditbeat/module/auditd/audit_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ const (

lostEventsUpdateInterval = time.Second * 15
maxDefaultStreamBufferConsumers = 4

setPIDMaxRetries = 5
)

type backpressureStrategy uint8
Expand Down Expand Up @@ -137,10 +139,32 @@ func newAuditClient(c *Config, log *logp.Logger) (*libaudit.AuditClient, error)
return libaudit.NewAuditClient(nil)
}

func closeAuditClient(client *libaudit.AuditClient) error {
discard := func(bytes []byte) ([]syscall.NetlinkMessage, error) {
return nil, nil
}
// Drain the netlink channel in parallel to Close() to prevent a deadlock.
// This goroutine will terminate once receive from netlink errors (EBADF,
// EBADFD, or any other error). This happens because the fd is closed.
go func() {
for {
_, err := client.Netlink.Receive(true, discard)
switch err {
case nil, syscall.EINTR:
case syscall.EAGAIN:
time.Sleep(50 * time.Millisecond)
default:
return
}
}
}()
return client.Close()
}

// Run initializes the audit client and receives audit messages from the
// kernel until the reporter's done channel is closed.
func (ms *MetricSet) Run(reporter mb.PushReporterV2) {
defer ms.client.Close()
defer closeAuditClient(ms.client)

if err := ms.addRules(reporter); err != nil {
reporter.Error(err)
Expand All @@ -164,7 +188,7 @@ func (ms *MetricSet) Run(reporter mb.PushReporterV2) {
go func() {
defer func() { // Close the most recently allocated "client" instance.
if client != nil {
client.Close()
closeAuditClient(client)
}
}()
timer := time.NewTicker(lostEventsUpdateInterval)
Expand All @@ -178,7 +202,7 @@ func (ms *MetricSet) Run(reporter mb.PushReporterV2) {
ms.updateKernelLostMetric(status.Lost)
} else {
ms.log.Error("get status request failed:", err)
if err = client.Close(); err != nil {
if err = closeAuditClient(client); err != nil {
ms.log.Errorw("Error closing audit monitoring client", "error", err)
}
client, err = libaudit.NewAuditClient(nil)
Expand Down Expand Up @@ -233,7 +257,7 @@ func (ms *MetricSet) addRules(reporter mb.PushReporterV2) error {
if err != nil {
return errors.Wrap(err, "failed to create audit client for adding rules")
}
defer client.Close()
defer closeAuditClient(client)

// Don't attempt to change configuration if audit rules are locked (enabled == 2).
// Will result in EPERM.
Expand Down Expand Up @@ -350,10 +374,12 @@ func (ms *MetricSet) initClient() error {
return errors.Wrap(err, "failed to enable auditing in the kernel")
}
}

if err := ms.client.WaitForPendingACKs(); err != nil {
return errors.Wrap(err, "failed to wait for ACKs")
}
if err := ms.client.SetPID(libaudit.WaitForReply); err != nil {

if err := ms.setPID(setPIDMaxRetries); err != nil {
if errno, ok := err.(syscall.Errno); ok && errno == syscall.EEXIST && status.PID != 0 {
return fmt.Errorf("failed to set audit PID. An audit process is already running (PID %d)", status.PID)
}
Expand All @@ -362,6 +388,20 @@ func (ms *MetricSet) initClient() error {
return nil
}

func (ms *MetricSet) setPID(retries int) (err error) {
if err = ms.client.SetPID(libaudit.WaitForReply); err == nil || errors.Cause(err) != syscall.ENOBUFS || retries == 0 {
return err
}
// At this point the netlink channel is congested (ENOBUFS).
// Drain and close the client, then retry with a new client.
closeAuditClient(ms.client)
if ms.client, err = newAuditClient(&ms.config, ms.log); err != nil {
return errors.Wrapf(err, "failed to recover from ENOBUFS")
}
ms.log.Info("Recovering from ENOBUFS ...")
return ms.setPID(retries - 1)
}

func (ms *MetricSet) updateKernelLostMetric(lost uint32) {
if !ms.kernelLost.enabled {
return
Expand Down

0 comments on commit 8e5336a

Please sign in to comment.