hyperlight-dev · simongdavies · Oct 28, 2025 · Oct 16, 2025
@@ -168,7 +168,9 @@ test-integration guest target=default-target features="":
     {{if os() == "windows" { "$env:" } else { "" } }}GUEST="{{guest}}"{{if os() == "windows" { ";" } else { "" } }} {{ cargo-cmd }} test --profile={{ if target == "debug" { "dev" } else { target } }} {{ target-triple-flag }} --test integration_test execute_on_heap {{ if features =="" {""} else {"--features " + features} }} -- --ignored
 
     @# run the rest of the integration tests
-    {{if os() == "windows" { "$env:" } else { "" } }}GUEST="{{guest}}"{{if os() == "windows" { ";" } else { "" } }} {{ cargo-cmd }} test -p hyperlight-host {{ if features =="" {''} else if features=="no-default-features" {"--no-default-features" } else {"--no-default-features -F init-paging," + features } }} --profile={{ if target == "debug" { "dev" } else { target } }} {{ target-triple-flag }} --test '*'
+    @# skip interrupt_random_kill_stress_test here, then run it explicitly below so we can see its output more clearly
+    {{if os() == "windows" { "$env:" } else { "" } }}GUEST="{{guest}}"{{if os() == "windows" { ";" } else { "" } }} {{ cargo-cmd }} test -p hyperlight-host {{ if features =="" {''} else if features=="no-default-features" {"--no-default-features" } else {"--no-default-features -F init-paging," + features } }} --profile={{ if target == "debug" { "dev" } else { target } }} {{ target-triple-flag }} --test '*' --  --skip interrupt_random_kill_stress_test
+    {{if os() == "windows" { "$env:" } else { "" } }}GUEST="{{guest}}"{{if os() == "windows" { ";" } else { "" } }} {{ cargo-cmd }} test -p hyperlight-host {{ if features =="" {''} else if features=="no-default-features" {"--no-default-features" } else {"--no-default-features -F init-paging," + features } }} --profile={{ if target == "debug" { "dev" } else { target } }} {{ target-triple-flag }} --test integration_test interrupt_random_kill_stress_test --  --nocapture --exact
 
 # tests compilation with no default features on different platforms
 test-compilation-no-default-features target=default-target:

@@ -390,7 +390,8 @@ impl HypervLinuxDriver {
 
         let interrupt_handle = Arc::new(LinuxInterruptHandle {
             running: AtomicU64::new(0),
-            cancel_requested: AtomicBool::new(false),
+            cancel_requested: AtomicU64::new(0),
+            call_active: AtomicBool::new(false),
             #[cfg(gdb)]
             debug_interrupt: AtomicBool::new(false),
             #[cfg(all(
@@ -658,17 +659,14 @@ impl Hypervisor for HypervLinuxDriver {
 
         self.interrupt_handle
             .tid
-            .store(unsafe { libc::pthread_self() as u64 }, Ordering::Relaxed);
-        // Note: if a `InterruptHandle::kill()` called while this thread is **here**
-        // Then this is fine since `cancel_requested` is set to true, so we will skip the `VcpuFd::run()` call
-        self.interrupt_handle
-            .set_running_and_increment_generation()
-            .map_err(|e| {
-                new_error!(
-                    "Error setting running state and incrementing generation: {}",
-                    e
-                )
-            })?;
+            .store(unsafe { libc::pthread_self() as u64 }, Ordering::Release);
+        // Note: if `InterruptHandle::kill()` is called while this thread is **here**
+        // (after set_running_bit but before checking cancel_requested):
+        // - kill() will stamp cancel_requested with the current generation
+        // - We will check cancel_requested below and skip the VcpuFd::run() call
+        // - This is the desired behavior - the kill takes effect immediately
+        let generation = self.interrupt_handle.set_running_bit();
+
         #[cfg(not(gdb))]
         let debug_interrupt = false;
         #[cfg(gdb)]
@@ -677,14 +675,16 @@ impl Hypervisor for HypervLinuxDriver {
             .debug_interrupt
             .load(Ordering::Relaxed);
 
-        // Don't run the vcpu if `cancel_requested` is true
+        // Don't run the vcpu if `cancel_requested` is set for our generation
         //
-        // Note: if a `InterruptHandle::kill()` called while this thread is **here**
-        // Then this is fine since `cancel_requested` is set to true, so we will skip the `VcpuFd::run()` call
+        // Note: if `InterruptHandle::kill()` is called while this thread is **here**
+        // (after checking cancel_requested but before vcpu.run()):
+        // - kill() will stamp cancel_requested with the current generation
+        // - We will proceed with vcpu.run(), but signals will be sent to interrupt it
+        // - The vcpu will be interrupted and return EINTR (handled below)
         let exit_reason = if self
             .interrupt_handle
-            .cancel_requested
-            .load(Ordering::Relaxed)
+            .is_cancel_requested_for_generation(generation)
             || debug_interrupt
         {
             Err(mshv_ioctls::MshvError::from(libc::EINTR))
@@ -705,27 +705,32 @@ impl Hypervisor for HypervLinuxDriver {
             #[cfg(mshv3)]
             self.vcpu_fd.run()
         };
-        // Note: if a `InterruptHandle::kill()` called while this thread is **here**
-        // Then signals will be sent to this thread until `running` is set to false.
-        // This is fine since the signal handler is a no-op.
+        // Note: if `InterruptHandle::kill()` is called while this thread is **here**
+        // (after vcpu.run() returns but before clear_running_bit):
+        // - kill() continues sending signals to this thread (running bit is still set)
+        // - The signals are harmless (no-op handler); we just need to check cancel_requested
+        // - We load cancel_requested below to determine if this run was cancelled
         let cancel_requested = self
             .interrupt_handle
-            .cancel_requested
-            .load(Ordering::Relaxed);
+            .is_cancel_requested_for_generation(generation);
         #[cfg(gdb)]
         let debug_interrupt = self
             .interrupt_handle
             .debug_interrupt
             .load(Ordering::Relaxed);
-        // Note: if a `InterruptHandle::kill()` called while this thread is **here**
-        // Then `cancel_requested` will be set to true again, which will cancel the **next vcpu run**.
-        // Additionally signals will be sent to this thread until `running` is set to false.
-        // This is fine since the signal handler is a no-op.
+        // Note: if `InterruptHandle::kill()` is called while this thread is **here**
+        // (after loading cancel_requested but before clear_running_bit):
+        // - kill() stamps cancel_requested with the CURRENT generation (not the one we just loaded)
+        // - kill() continues sending signals until running bit is cleared
+        // - The newly stamped cancel_requested will affect the NEXT vcpu.run() call
+        // - Signals sent now are harmless (no-op handler)
         self.interrupt_handle.clear_running_bit();
-        // At this point, `running` is false so no more signals will be sent to this thread,
-        // but we may still receive async signals that were sent before this point.
-        // To prevent those signals from interrupting subsequent calls to `run()`,
-        // we make sure to check `cancel_requested` before cancelling (see `libc::EINTR` match-arm below).
+        // At this point, running bit is clear so kill() will stop sending signals.
+        // However, we may still receive delayed signals that were sent before clear_running_bit.
+        // These stale signals are harmless because:
+        // - The signal handler is a no-op
+        // - We check generation matching in cancel_requested before treating EINTR as cancellation
+        // - If generation doesn't match, we return Retry instead of Cancelled
         let result = match exit_reason {
             Ok(m) => match m.header.message_type {
                 HALT_MESSAGE => {
@@ -805,14 +810,16 @@ impl Hypervisor for HypervLinuxDriver {
                 }
             },
             Err(e) => match e.errno() {
-                // we send a signal to the thread to cancel execution this results in EINTR being returned by KVM so we return Cancelled
+                // We send a signal (SIGRTMIN+offset) to interrupt the vcpu, which causes EINTR
                 libc::EINTR => {
-                    // If cancellation was not requested for this specific vm, the vcpu was interrupted because of debug interrupt or
-                    // a stale signal that meant to be delivered to a previous/other vcpu on this same thread, so let's ignore it
+                    // Check if cancellation was requested for THIS specific generation.
+                    // If not, the EINTR came from:
+                    // - A debug interrupt (if GDB is enabled)
+                    // - A stale signal from a previous guest call (generation mismatch)
+                    // - A signal meant for a different sandbox on the same thread
+                    // In these cases, we return Retry to continue execution.
                     if cancel_requested {
-                        self.interrupt_handle
-                            .cancel_requested
-                            .store(false, Ordering::Relaxed);
+                        self.interrupt_handle.clear_cancel_requested();
                         HyperlightExit::Cancelled()
                     } else {
                         #[cfg(gdb)]

@@ -17,7 +17,7 @@ limitations under the License.
 use std::fmt;
 use std::fmt::{Debug, Formatter};
 use std::string::String;
-use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
 use std::sync::{Arc, Mutex};
 
 use log::LevelFilter;
@@ -327,10 +327,11 @@ impl HypervWindowsDriver {
         };
 
         let interrupt_handle = Arc::new(WindowsInterruptHandle {
-            running: AtomicBool::new(false),
-            cancel_requested: AtomicBool::new(false),
+            running: AtomicU64::new(0),
+            cancel_requested: AtomicU64::new(0),
             #[cfg(gdb)]
             debug_interrupt: AtomicBool::new(false),
+            call_active: AtomicBool::new(false),
             partition_handle,
             dropped: AtomicBool::new(false),
         });
@@ -549,7 +550,8 @@ impl Hypervisor for HypervWindowsDriver {
         &mut self,
         #[cfg(feature = "trace_guest")] tc: &mut crate::sandbox::trace::TraceContext,
     ) -> Result<super::HyperlightExit> {
-        self.interrupt_handle.running.store(true, Ordering::Relaxed);
+        // Get current generation and set running bit
+        let generation = self.interrupt_handle.set_running_bit();
 
         #[cfg(not(gdb))]
         let debug_interrupt = false;
@@ -559,11 +561,10 @@ impl Hypervisor for HypervWindowsDriver {
             .debug_interrupt
             .load(Ordering::Relaxed);
 
-        // Don't run the vcpu if `cancel_requested` is true
+        // Check if cancellation was requested for THIS generation
         let exit_context = if self
             .interrupt_handle
-            .cancel_requested
-            .load(Ordering::Relaxed)
+            .is_cancel_requested_for_generation(generation)
             || debug_interrupt
         {
             WHV_RUN_VP_EXIT_CONTEXT {
@@ -578,12 +579,21 @@ impl Hypervisor for HypervWindowsDriver {
 
             self.processor.run()?
         };
-        self.interrupt_handle
-            .cancel_requested
-            .store(false, Ordering::Relaxed);
-        self.interrupt_handle
-            .running
-            .store(false, Ordering::Relaxed);
+
+        // Clear running bit
+        self.interrupt_handle.clear_running_bit();
+
+        let is_canceled = exit_context.ExitReason == WHV_RUN_VP_EXIT_REASON(8193i32); // WHvRunVpExitReasonCanceled
+
+        // Check if this was a manual cancellation (vs internal Windows cancellation)
+        let cancel_was_requested_manually = self
+            .interrupt_handle
+            .is_cancel_requested_for_generation(generation);
+
+        // Only clear cancel_requested if we're actually processing a cancellation for this generation
+        if is_canceled && cancel_was_requested_manually {
+            self.interrupt_handle.clear_cancel_requested();
+        }
 
         #[cfg(gdb)]
         let debug_interrupt = self
@@ -659,12 +669,32 @@ impl Hypervisor for HypervWindowsDriver {
                     // return a special exit reason so that the gdb thread can handle it
                     // and resume execution
                     HyperlightExit::Debug(VcpuStopReason::Interrupt)
+                } else if !cancel_was_requested_manually {
+                    // This was an internal cancellation
+                    // The virtualization stack can use this function to return the control
+                    // of a virtual processor back to the virtualization stack in case it
+                    // needs to change the state of a VM or to inject an event into the processor
+                    // see https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/whvcancelrunvirtualprocessor#remarks
+                    debug!("Internal cancellation detected, returning Retry error");
+                    HyperlightExit::Retry()
                 } else {
                     HyperlightExit::Cancelled()
                 }
 
                 #[cfg(not(gdb))]
-                HyperlightExit::Cancelled()
+                {
+                    if !cancel_was_requested_manually {
+                        // This was an internal cancellation
+                        // The virtualization stack can use this function to return the control
+                        // of a virtual processor back to the virtualization stack in case it
+                        // needs to change the state of a VM or to inject an event into the processor
+                        // see https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/whvcancelrunvirtualprocessor#remarks
+                        debug!("Internal cancellation detected, returning Retry error");
+                        HyperlightExit::Retry()
+                    } else {
+                        HyperlightExit::Cancelled()
+                    }
+                }
             }
             #[cfg(gdb)]
             WHV_RUN_VP_EXIT_REASON(4098i32) => {
@@ -964,30 +994,77 @@ impl Drop for HypervWindowsDriver {
 
 #[derive(Debug)]
 pub struct WindowsInterruptHandle {
-    // `WHvCancelRunVirtualProcessor()` will return Ok even if the vcpu is not running, which is the reason we need this flag.
-    running: AtomicBool,
-    cancel_requested: AtomicBool,
+    /// Combined running flag (bit 63) and generation counter (bits 0-62).
+    ///
+    /// The generation increments with each guest function call to prevent
+    /// stale cancellations from affecting new calls (ABA problem).
+    ///
+    /// Layout: `[running:1 bit][generation:63 bits]`
+    running: AtomicU64,
+
+    /// Combined cancel_requested flag (bit 63) and generation counter (bits 0-62).
+    ///
+    /// When kill() is called, this stores the current generation along with
+    /// the cancellation flag. The VCPU only honors the cancellation if the
+    /// generation matches its current generation.
+    ///
+    /// Layout: `[cancel_requested:1 bit][generation:63 bits]`
+    cancel_requested: AtomicU64,
+
     // This is used to signal the GDB thread to stop the vCPU
     #[cfg(gdb)]
     debug_interrupt: AtomicBool,
+    /// Flag indicating whether a guest function call is currently in progress.
+    ///
+    /// **true**: A guest function call is active (between call start and completion)
+    /// **false**: No guest function call is active
+    ///
+    /// # Purpose
+    ///
+    /// This flag prevents kill() from having any effect when called outside of a
+    /// guest function call. This solves the "kill-in-advance" problem where kill()
+    /// could be called before a guest function starts and would incorrectly cancel it.
+    call_active: AtomicBool,
     partition_handle: WHV_PARTITION_HANDLE,
     dropped: AtomicBool,
 }
 
 impl InterruptHandle for WindowsInterruptHandle {
     fn kill(&self) -> bool {
-        self.cancel_requested.store(true, Ordering::Relaxed);
-        self.running.load(Ordering::Relaxed)
-            && unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, 0, 0).is_ok() }
+        // Check if a call is actually active first
+        if !self.call_active.load(Ordering::Acquire) {
+            return false;
+        }
+
+        // Get the current running state and generation
+        let (running, generation) = self.get_running_and_generation();
+
+        // Set cancel_requested with the current generation
+        self.set_cancel_requested(generation);
+
+        // Only call WHvCancelRunVirtualProcessor if VCPU is actually running in guest mode
+        running && unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, 0, 0).is_ok() }
     }
     #[cfg(gdb)]
     fn kill_from_debugger(&self) -> bool {
         self.debug_interrupt.store(true, Ordering::Relaxed);
-        self.running.load(Ordering::Relaxed)
-            && unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, 0, 0).is_ok() }
+        let (running, _) = self.get_running_and_generation();
+        running && unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, 0, 0).is_ok() }
+    }
+
+    fn get_call_active(&self) -> &AtomicBool {
+        &self.call_active
+    }
+
+    fn get_dropped(&self) -> &AtomicBool {
+        &self.dropped
+    }
+
+    fn get_running(&self) -> &AtomicU64 {
+        &self.running
     }
 
-    fn dropped(&self) -> bool {
-        self.dropped.load(Ordering::Relaxed)
+    fn get_cancel_requested(&self) -> &AtomicU64 {
+        &self.cancel_requested
     }
 }