diff --git a/CHANGELOG.md b/CHANGELOG.md index f7f50678de4..c29a94cbdd2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,15 @@ and this project adheres to `VIRTIO_NET_F_RX_MRGBUF` support to the `virtio-net` device. When this feature is negotiated, guest `virtio-net` driver can perform more efficient memory management which in turn improves RX and TX performance. +- [#4460](https://github.com/firecracker-microvm/firecracker/pull/4460): Add a + call to + [`KVM_KVMCLOCK_CTRL`](https://docs.kernel.org/virt/kvm/api.html#kvm-kvmclock-ctrl) + after pausing vCPUs on x86_64 architectures. This ioctl sets a flag in the KVM + state of the vCPU indicating that it has been paused by the host userspace. In + guests that use kvmclock, the soft lockup watchdog checks this flag. If it is + set, it won't trigger the lockup condition. Calling the ioctl for guests that + don't use kvmclock will fail. These failures are not fatal. We log the failure + and increase the `vcpu.kvmclock_ctrl_fails` metric. ### Changed diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index 75c9afa02f0..861b69c6b44 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -1254,6 +1254,18 @@ } ] }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 44717, + "comment": "KVM_KVMCLOCK_CTRL. We call this after pausing vCPUs to avoid soft lockups in the guest." + } + ] + }, { "syscall": "sched_yield", "comment": "Used by the rust standard library in std::sync::mpmc. Firecracker uses mpsc channels from this module for inter-thread communication" diff --git a/src/vmm/src/logger/metrics.rs b/src/vmm/src/logger/metrics.rs index 083881645a5..bcde569115e 100644 --- a/src/vmm/src/logger/metrics.rs +++ b/src/vmm/src/logger/metrics.rs @@ -779,6 +779,8 @@ pub struct VcpuMetrics { pub exit_mmio_write: SharedIncMetric, /// Number of errors during this VCPU's run. pub failures: SharedIncMetric, + /// Number of times that the `KVM_KVMCLOCK_CTRL` ioctl failed. + pub kvmclock_ctrl_fails: SharedIncMetric, /// Provides Min/max/sum for KVM exits handling input IO. pub exit_io_in_agg: LatencyAggregateMetrics, /// Provides Min/max/sum for KVM exits handling output IO. @@ -797,6 +799,7 @@ impl VcpuMetrics { exit_mmio_read: SharedIncMetric::new(), exit_mmio_write: SharedIncMetric::new(), failures: SharedIncMetric::new(), + kvmclock_ctrl_fails: SharedIncMetric::new(), exit_io_in_agg: LatencyAggregateMetrics::new(), exit_io_out_agg: LatencyAggregateMetrics::new(), exit_mmio_read_agg: LatencyAggregateMetrics::new(), diff --git a/src/vmm/src/vstate/vcpu/mod.rs b/src/vmm/src/vstate/vcpu/mod.rs index 43a0946931e..cb63afa4579 100644 --- a/src/vmm/src/vstate/vcpu/mod.rs +++ b/src/vmm/src/vstate/vcpu/mod.rs @@ -336,8 +336,15 @@ impl Vcpu { .send(VcpuResponse::Paused) .expect("vcpu channel unexpectedly closed"); - // TODO: we should call `KVM_KVMCLOCK_CTRL` here to make sure - // TODO continued: the guest soft lockup watchdog does not panic on Resume. + // Calling `KVM_KVMCLOCK_CTRL` to make sure the guest softlockup watchdog + // does not panic on resume, see https://docs.kernel.org/virt/kvm/api.html . + // We do not want to fail if the call is not successful, because depending + // that may be acceptable depending on the workload. + #[cfg(target_arch = "x86_64")] + if let Err(err) = self.kvm_vcpu.fd.kvmclock_ctrl() { + METRICS.vcpu.kvmclock_ctrl_fails.inc(); + warn!("KVM_KVMCLOCK_CTRL call failed {}", err); + } // Move to 'paused' state. state = StateMachine::next(Self::paused); diff --git a/tests/framework/utils.py b/tests/framework/utils.py index 88e520fa5f3..dbae40016ea 100644 --- a/tests/framework/utils.py +++ b/tests/framework/utils.py @@ -1,6 +1,7 @@ # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: Apache-2.0 """Generic utility functions that are used in the framework.""" +import errno import functools import json import logging @@ -453,7 +454,9 @@ def get_process_pidfd(pid): """Get a pidfd file descriptor for the process with PID `pid` Will return a pid file descriptor for the process with PID `pid` if it is - still alive. If the process has already exited it will return `None`. + still alive. If the process has already exited we will receive either a + `ProcessLookupError` exception or and an `OSError` exception with errno `EINVAL`. + In these cases, we will return `None`. Any other error while calling the system call, will raise an OSError exception. @@ -462,6 +465,11 @@ def get_process_pidfd(pid): pidfd = os.pidfd_open(pid) except ProcessLookupError: return None + except OSError as err: + if err.errno == errno.EINVAL: + return None + + raise return pidfd diff --git a/tests/host_tools/fcmetrics.py b/tests/host_tools/fcmetrics.py index 457b3c65aa2..e33e89089dc 100644 --- a/tests/host_tools/fcmetrics.py +++ b/tests/host_tools/fcmetrics.py @@ -234,6 +234,7 @@ def validate_fc_metrics(metrics): "exit_mmio_read", "exit_mmio_write", "failures", + "kvmclock_ctrl_fails", {"exit_io_in_agg": latency_agg_metrics_fields}, {"exit_io_out_agg": latency_agg_metrics_fields}, {"exit_mmio_read_agg": latency_agg_metrics_fields}, diff --git a/tests/integration_tests/functional/test_pause_resume.py b/tests/integration_tests/functional/test_pause_resume.py index 4210e3a0d7f..3d0ac124c11 100644 --- a/tests/integration_tests/functional/test_pause_resume.py +++ b/tests/integration_tests/functional/test_pause_resume.py @@ -2,6 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 """Basic tests scenarios for snapshot save/restore.""" +import platform +import time + import pytest @@ -127,3 +130,38 @@ def test_pause_resume_preboot(uvm_nano): # Try to resume microvm when not running, it must fail. with pytest.raises(RuntimeError, match=expected_err): basevm.api.vm.patch(state="Resumed") + + +@pytest.mark.skipif( + platform.machine() != "x86_64", reason="Only x86_64 supports pvclocks." +) +def test_kvmclock_ctrl(uvm_plain_any): + """ + Test that pausing vCPUs does not trigger a soft lock-up + """ + + microvm = uvm_plain_any + microvm.help.enable_console() + microvm.spawn() + microvm.basic_config() + microvm.add_net_iface() + microvm.start() + + # Launch reproducer in host + # This launches `ls -R /` in a loop inside the guest. The command writes its output in the + # console. This detail is important as it writing in the console seems to increase the probability + # that we will pause the execution inside the kernel and cause a lock up. Setting KVM_CLOCK_CTRL + # bit that informs the guest we're pausing the vCPUs, should avoid that lock up. + microvm.ssh.check_output( + "timeout 60 sh -c 'while true; do ls -R /; done' > /dev/ttyS0 2>&1 < /dev/null &" + ) + + for _ in range(12): + microvm.api.vm.patch(state="Paused") + time.sleep(5) + microvm.api.vm.patch(state="Resumed") + + dmesg = microvm.ssh.check_output("dmesg").stdout + assert "rcu_sched self-detected stall on CPU" not in dmesg + assert "rcu_preempt detected stalls on CPUs/tasks" not in dmesg + assert "BUG: soft lockup -" not in dmesg