Skip to content

Commit

Permalink
auditd: Fix kernel deadlock after ENOBUFS (#26032)
Browse files Browse the repository at this point in the history
This fixes a deadlock when the netlink channel is congested
(initialization fails with "no buffer space available" / errno=ENOBUFS).

Closes #26031

(cherry picked from commit 3b50a28)
  • Loading branch information
adriansr authored and mergify-bot committed Jun 7, 2021
1 parent c7804cd commit f5e1b28
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 5 deletions.
29 changes: 29 additions & 0 deletions CHANGELOG.next.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,35 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d
- system/package: Fix an error that can occur while trying to persist package metadata. {issue}18536[18536] {pull}18887[18887]
- system/socket: Fix dataset using 100% CPU and becoming unresponsive in some scenarios. {pull}19033[19033] {pull}19764[19764]
- system/socket: Fixed tracking of long-running connections. {pull}19033[19033]
- system/package: Fix librpm loading on Fedora 31/32. {pull}NNNN[NNNN]
- file_integrity: Create fsnotify watcher only when starting file_integrity module {pull}19505[19505]
- auditd: Fix spelling of anomaly in `event.category`.
- auditd: Fix typo in `event.action` of `removed-user-role-from`. {pull}19300[19300]
- auditd: Fix typo in `event.action` of `used-suspicious-link`. {pull}19300[19300]
- system/socket: Fix kprobe grouping to allow running more than one instance. {pull}20325[20325]
- system/socket: Fixed a crash due to concurrent map read and write. {issue}21192[21192] {pull}21690[21690]
- file_integrity: stop monitoring excluded paths {issue}21278[21278] {pull}21282[21282]
- auditd: Fix an error condition causing a lot of `audit_send_reply` kernel threads being created. {pull}22673[22673]
- system/socket: Fixed start failure when run under config reloader. {issue}20851[20851] {pull}21693[21693]
- system/socket: Having some CPUs unavailable to Auditbeat could cause startup errors or event loss. {pull}22827[22827]
- Note incompatibility of system/socket on ARM. {pull}23381[23381]
- auditd: Fix kernel deadlock when netlink congestion causes "no buffer space available" errors. {issue}26031[26031] {pull}26032[26032]

*Filebeat*

- Fix mapping of fortinet.firewall.mem as integer. {pull}19335[19335]
- Ensure all zeek timestamps include millisecond precision. {issue}14599[14599] {pull}16766[16766]
- Fix s3 input hanging with GetObjectRequest API call by adding context_timeout config. {issue}15502[15502] {pull}15590[15590]
- Add shared_credential_file to cloudtrail config {issue}15652[15652] {pull}15656[15656]
- Fix typos in zeek notice fileset config file. {issue}15764[15764] {pull}15765[15765]
- Fix mapping error when zeek weird logs do not contain IP addresses. {pull}15906[15906]
- Improve `elasticsearch/audit` fileset to handle timestamps correctly. {pull}15942[15942]
- Prevent Elasticsearch from spewing log warnings about redundant wildcards when setting up ingest pipelines for the `elasticsearch` module. {issue}15840[15840] {pull}15900[15900]
- Fix mapping error for cloudtrail additionalEventData field {pull}16088[16088]
- Fix a connection error in httpjson input. {pull}16123[16123]
- Fix integer overflow in S3 offsets when collecting very large files. {pull}22523[22523]
- Fix CredentialsJSON unpacking for `gcp-pubsub` and `httpjson` inputs. {pull}23277[23277]
- Fix issue with m365_defender when parsing incidents that have no alerts attached. {pull}25421[25421]

*Filebeat*

Expand Down
50 changes: 45 additions & 5 deletions auditbeat/module/auditd/audit_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ const (

lostEventsUpdateInterval = time.Second * 15
maxDefaultStreamBufferConsumers = 4

setPIDMaxRetries = 5
)

type backpressureStrategy uint8
Expand Down Expand Up @@ -137,10 +139,32 @@ func newAuditClient(c *Config, log *logp.Logger) (*libaudit.AuditClient, error)
return libaudit.NewAuditClient(nil)
}

// closeAuditClient closes client while a background goroutine keeps reading
// from its netlink socket, and returns the result of client.Close().
//
// The concurrent reads exist so that Close() cannot deadlock on a congested
// netlink channel. Everything that is read is thrown away.
func closeAuditClient(client *libaudit.AuditClient) error {
	// drop is a parser that ignores the payload it is given.
	drop := func([]byte) ([]syscall.NetlinkMessage, error) { return nil, nil }

	// The reader exits on the first error that is neither EINTR nor EAGAIN.
	// That is expected to happen once the fd has been closed (EBADF, EBADFD,
	// or any other error).
	go func() {
		for {
			_, rcvErr := client.Netlink.Receive(true, drop)
			if rcvErr == nil || rcvErr == syscall.EINTR {
				// Message consumed or call interrupted: read again right away.
				continue
			}
			if rcvErr != syscall.EAGAIN {
				return
			}
			// Nothing to read at the moment; back off briefly before polling again.
			time.Sleep(50 * time.Millisecond)
		}
	}()

	return client.Close()
}

// Run initializes the audit client and receives audit messages from the
// kernel until the reporter's done channel is closed.
func (ms *MetricSet) Run(reporter mb.PushReporterV2) {
defer ms.client.Close()
defer closeAuditClient(ms.client)

if err := ms.addRules(reporter); err != nil {
reporter.Error(err)
Expand All @@ -164,7 +188,7 @@ func (ms *MetricSet) Run(reporter mb.PushReporterV2) {
go func() {
defer func() { // Close the most recently allocated "client" instance.
if client != nil {
client.Close()
closeAuditClient(client)
}
}()
timer := time.NewTicker(lostEventsUpdateInterval)
Expand All @@ -178,7 +202,7 @@ func (ms *MetricSet) Run(reporter mb.PushReporterV2) {
ms.updateKernelLostMetric(status.Lost)
} else {
ms.log.Error("get status request failed:", err)
if err = client.Close(); err != nil {
if err = closeAuditClient(client); err != nil {
ms.log.Errorw("Error closing audit monitoring client", "error", err)
}
client, err = libaudit.NewAuditClient(nil)
Expand Down Expand Up @@ -233,7 +257,7 @@ func (ms *MetricSet) addRules(reporter mb.PushReporterV2) error {
if err != nil {
return errors.Wrap(err, "failed to create audit client for adding rules")
}
defer client.Close()
defer closeAuditClient(client)

// Don't attempt to change configuration if audit rules are locked (enabled == 2).
// Will result in EPERM.
Expand Down Expand Up @@ -350,10 +374,12 @@ func (ms *MetricSet) initClient() error {
return errors.Wrap(err, "failed to enable auditing in the kernel")
}
}

if err := ms.client.WaitForPendingACKs(); err != nil {
return errors.Wrap(err, "failed to wait for ACKs")
}
if err := ms.client.SetPID(libaudit.WaitForReply); err != nil {

if err := ms.setPID(setPIDMaxRetries); err != nil {
if errno, ok := err.(syscall.Errno); ok && errno == syscall.EEXIST && status.PID != 0 {
return fmt.Errorf("failed to set audit PID. An audit process is already running (PID %d)", status.PID)
}
Expand All @@ -362,6 +388,20 @@ func (ms *MetricSet) initClient() error {
return nil
}

// setPID calls SetPID on the audit client and, when that call fails with
// ENOBUFS (congested netlink channel), replaces the client and tries again.
//
// retries is the number of additional attempts allowed after the first one.
// Any error other than ENOBUFS, or ENOBUFS once retries has reached zero, is
// returned unchanged. A failure to create the replacement client is returned
// wrapped with a "failed to recover from ENOBUFS" message.
func (ms *MetricSet) setPID(retries int) (err error) {
	for {
		err = ms.client.SetPID(libaudit.WaitForReply)
		if err == nil || errors.Cause(err) != syscall.ENOBUFS || retries == 0 {
			return err
		}

		// At this point the netlink channel is congested (ENOBUFS).
		// Drain and close the client, then retry with a new client.
		if closeErr := closeAuditClient(ms.client); closeErr != nil {
			// Not fatal, since this client is being discarded anyway, but
			// report the failure instead of silently dropping it.
			ms.log.Errorw("Error closing audit monitoring client", "error", closeErr)
		}
		if ms.client, err = newAuditClient(&ms.config, ms.log); err != nil {
			return errors.Wrap(err, "failed to recover from ENOBUFS")
		}
		ms.log.Info("Recovering from ENOBUFS ...")
		retries--
	}
}

func (ms *MetricSet) updateKernelLostMetric(lost uint32) {
if !ms.kernelLost.enabled {
return
Expand Down

0 comments on commit f5e1b28

Please sign in to comment.