From 11da9c04aea6035dca305588628e52a8d673e135 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 19 Dec 2024 12:00:12 -0800 Subject: [PATCH] gimlet-seq: Record why the power state changed (#1953) Currently, when the Gimlet CPU sequencer changes the system's power state, no information about *why* the power state changed is recorded by the SP firmware. A system may power off or reboot for a variety of reasons: it may be requested by the host OS over IPCC, by the control plane over the management network, or triggered by the thermal task due to an overheat condition. This makes debugging an unexpected reboot or power off difficult, as the SP ringbuffers and other diagnostics do not indicate why an unexpected power state change occurred. See #1950 for a motivating example. This commit resolves this as described in #1950 by adding a new field to the `SetState` variant in the `drv-gimlet-seq-server` ringbuffer, so that the reason a power state change occurred can be recorded. A new IPC function, `Sequencer.set_state_with_reason`, is added to the `cpu_seq` IPC API. This is equivalent to `Sequencer.set_state` but with the addition of a `StateChangeReason` argument in addition to the desired power state, and the sequencer task will record the provided reason in its ringbuffer. This way, we can distinguish between the various reasons a power state change may have occurred when debugging such issues. All Hubris-internal callers of `Sequencer.set_state` are updated to instead use `Sequencer.set_state_with_reason`. In particular, `host-sp-comms` will record a variety of different `StateChangeReason`s, allowing us to indicate whether the host requested a normal power-off/reboot, the host OS panicked or failed to boot, or the host CPU reset itself. Other callers like `control-plane-agent` and `thermal` are simpler and just say "it was the control plane" or "overheat", respectively. For backwards compatibility with existing callers of `Sequencer.set_state` via `hiffy`, the `set_state` IPC is left as-is, and will be recorded in the ringbuffer with `StateChangeReason::Other`. Since all Hubris tasks now use the new API, `Other` *basically* just means `hiffy`. The `StateChangeReason` enum also generates counters, so that the total number of power state changes can be tracked. Also, while I was here, I've changed the `Trace::SetState` entry in the `drv-gimlet-seq-server` ringbuf from a tuple-like enum variant to a struct-like enum variant with named fields. This entry includes two `PowerState` fields, one recording the previous power state and the other recording the new power state that has been set. IMHO, using a tuple-like variant to represent this is a bit unfortunate, as in Humility, we'll see two values of the same type and it's not immediately obvious which is the previous state and which is the new state. This must be determined based on the order of the fields in the ringbuf entry, which requires referencing the Hubris code to determine. I felt like it was nicer to just use a struct-like variant with named fields for this. That way, the semantic meaning of the two `PowerState`s is actually encoded in the debug info, and Humility can just indicate which is the previous state and which is the new state when displaying the ring buffer. I also think it's a bit nicer to name the timestamp field --- otherwise, it just looks like some arbitrary integer, and you need to look at the code to determine that it's the timestamp of the power state change. Fixes #1950 --- drv/cpu-seq-api/src/lib.rs | 31 +++++++++ drv/gimlet-seq-server/src/main.rs | 39 +++++++++-- drv/grapefruit-seq-server/src/main.rs | 44 +++++++++---- drv/mock-gimlet-seq-server/src/main.rs | 38 +++++++---- idl/cpu-seq.idol | 19 +++++- .../src/mgs_compute_sled.rs | 5 +- task/host-sp-comms/src/main.rs | 65 +++++++++++++++---- task/thermal/src/bsp/gimlet_bcdef.rs | 5 +- 8 files changed, 196 insertions(+), 50 deletions(-) diff --git a/drv/cpu-seq-api/src/lib.rs b/drv/cpu-seq-api/src/lib.rs index 8ab006e09..4dc3ba20b 100644 --- a/drv/cpu-seq-api/src/lib.rs +++ b/drv/cpu-seq-api/src/lib.rs @@ -9,6 +9,7 @@ use counters::Count; use derive_idol_err::IdolError; use userlib::{sys_send, FromPrimitive}; +use zerocopy::AsBytes; // Re-export PowerState for client convenience. pub use drv_cpu_power_state::PowerState; @@ -31,6 +32,36 @@ pub enum SeqError { ServerRestarted, } +#[derive(Copy, Clone, Debug, FromPrimitive, Eq, PartialEq, AsBytes, Count)] +#[repr(u8)] +pub enum StateChangeReason { + /// No reason was provided. + /// + /// This indicates a legacy caller of `Sequencer.set_state`, rather than + /// `Sequencer.set_state_with_reason`. All Hubris-internal callers should + /// use `set_state_with_reason`, so this variant generally indicates that + /// the `Sequencer.set_state` IPC is being called via Hiffy. + Other = 1, + /// The system has just received power, so the sequencer has booted the + /// host CPU. + InitialPowerOn, + /// A power state change was requested by the control plane. + ControlPlane, + /// The host CPU reset while in A0, so the system has powered off to clear + /// hidden core state. + CpuReset, + /// The host OS failed to boot, so the system has powered off. + HostBootFailure, + /// The host OS panicked. + HostPanic, + /// The host OS requested that the system power off without rebooting. + HostPowerOff, + /// The host OS requested that the system reboot. + HostReboot, + /// The system powered off because a component has overheated. + Overheat, +} + // On Gimlet, we have two banks of up to 8 DIMMs apiece. Export the "two banks" // bit of knowledge here so it can be used by gimlet-seq-server, spd, and // packrat, all of which want to know at compile-time how many banks there are. diff --git a/drv/gimlet-seq-server/src/main.rs b/drv/gimlet-seq-server/src/main.rs index 0c3776e96..2e66c25b5 100644 --- a/drv/gimlet-seq-server/src/main.rs +++ b/drv/gimlet-seq-server/src/main.rs @@ -17,7 +17,7 @@ use userlib::{ sys_set_timer, task_slot, units, RecvMessage, TaskId, UnwrapLite, }; -use drv_cpu_seq_api::{PowerState, SeqError}; +use drv_cpu_seq_api::{PowerState, SeqError, StateChangeReason}; use drv_hf_api as hf_api; use drv_i2c_api as i2c; use drv_ice40_spi_program as ice40; @@ -90,7 +90,13 @@ enum Trace { RailsOn, UartEnabled, A0(u16), - SetState(PowerState, PowerState, u64), + SetState { + prev: PowerState, + next: PowerState, + #[count(children)] + why: StateChangeReason, + now: u64, + }, UpdateState(#[count(children)] PowerState), ClockConfigWrite, ClockConfigSuccess, @@ -482,7 +488,10 @@ impl ServerImpl { // Power on, unless suppressed by the `stay-in-a2` feature if !cfg!(feature = "stay-in-a2") { - _ = server.set_state_internal(PowerState::A0); + _ = server.set_state_internal( + PowerState::A0, + StateChangeReason::InitialPowerOn, + ); } // @@ -666,11 +675,17 @@ impl ServerImpl { fn set_state_internal( &mut self, state: PowerState, + why: StateChangeReason, ) -> Result<(), SeqError> { let sys = sys_api::Sys::from(SYS.get_task_id()); let now = sys_get_timer().now; - ringbuf_entry!(Trace::SetState(self.state, state, now)); + ringbuf_entry!(Trace::SetState { + prev: self.state, + next: state, + why, + now + }); ringbuf_entry_v3p3_sys_a0_vout(); @@ -813,6 +828,7 @@ impl ServerImpl { // // Now wait for the end of Group C. + // loop { let mut status = [0u8]; @@ -1033,7 +1049,18 @@ impl idl::InOrderSequencerImpl for ServerImpl { _: &RecvMessage, state: PowerState, ) -> Result<(), RequestError> { - self.set_state_internal(state).map_err(RequestError::from) + self.set_state_internal(state, StateChangeReason::Other) + .map_err(RequestError::from) + } + + fn set_state_with_reason( + &mut self, + _: &RecvMessage, + state: PowerState, + reason: StateChangeReason, + ) -> Result<(), RequestError> { + self.set_state_internal(state, reason) + .map_err(RequestError::from) } fn send_hardware_nmi( @@ -1403,7 +1430,7 @@ cfg_if::cfg_if! { } mod idl { - use super::SeqError; + use super::{SeqError, StateChangeReason}; include!(concat!(env!("OUT_DIR"), "/server_stub.rs")); } diff --git a/drv/grapefruit-seq-server/src/main.rs b/drv/grapefruit-seq-server/src/main.rs index 176ac1570..5787e7cec 100644 --- a/drv/grapefruit-seq-server/src/main.rs +++ b/drv/grapefruit-seq-server/src/main.rs @@ -7,7 +7,7 @@ #![no_std] #![no_main] -use drv_cpu_seq_api::PowerState; +use drv_cpu_seq_api::{PowerState, StateChangeReason}; use drv_spi_api::{SpiDevice, SpiServer}; use drv_stm32xx_sys_api as sys_api; use idol_runtime::{NotificationHandler, RequestError}; @@ -29,6 +29,7 @@ enum Trace { ContinueBitstreamLoad(usize), WaitForDone, Programmed, + #[count(skip)] None, } @@ -268,6 +269,20 @@ impl ServerImpl { fn set_state_impl(&self, state: PowerState) { self.jefe.set_state(state as u32); } + + fn validate_state_change( + &self, + state: PowerState, + ) -> Result<(), drv_cpu_seq_api::SeqError> { + match (self.get_state_impl(), state) { + (PowerState::A2, PowerState::A0) + | (PowerState::A0, PowerState::A2) + | (PowerState::A0PlusHP, PowerState::A2) + | (PowerState::A0Thermtrip, PowerState::A2) => Ok(()), + + _ => Err(drv_cpu_seq_api::SeqError::IllegalTransition), + } + } } // The `Sequencer` implementation for Grapefruit is copied from @@ -286,19 +301,20 @@ impl idl::InOrderSequencerImpl for ServerImpl { _: &RecvMessage, state: PowerState, ) -> Result<(), RequestError> { - match (self.get_state_impl(), state) { - (PowerState::A2, PowerState::A0) - | (PowerState::A0, PowerState::A2) - | (PowerState::A0PlusHP, PowerState::A2) - | (PowerState::A0Thermtrip, PowerState::A2) => { - self.set_state_impl(state); - Ok(()) - } + self.validate_state_change(state)?; + self.set_state_impl(state); + Ok(()) + } - _ => Err(RequestError::Runtime( - drv_cpu_seq_api::SeqError::IllegalTransition, - )), - } + fn set_state_with_reason( + &mut self, + _: &RecvMessage, + state: PowerState, + _: StateChangeReason, + ) -> Result<(), RequestError> { + self.validate_state_change(state)?; + self.set_state_impl(state); + Ok(()) } fn send_hardware_nmi( @@ -327,7 +343,7 @@ impl NotificationHandler for ServerImpl { } mod idl { - use drv_cpu_seq_api::SeqError; + use drv_cpu_seq_api::{SeqError, StateChangeReason}; include!(concat!(env!("OUT_DIR"), "/server_stub.rs")); } diff --git a/drv/mock-gimlet-seq-server/src/main.rs b/drv/mock-gimlet-seq-server/src/main.rs index 64b0733bd..46d9f48c4 100644 --- a/drv/mock-gimlet-seq-server/src/main.rs +++ b/drv/mock-gimlet-seq-server/src/main.rs @@ -7,7 +7,7 @@ #![no_std] #![no_main] -use drv_cpu_seq_api::{PowerState, SeqError}; +use drv_cpu_seq_api::{PowerState, SeqError, StateChangeReason}; use idol_runtime::{NotificationHandler, RequestError}; use task_jefe_api::Jefe; use userlib::{FromPrimitive, RecvMessage, UnwrapLite}; @@ -44,6 +44,17 @@ impl ServerImpl { fn set_state_impl(&self, state: PowerState) { self.jefe.set_state(state as u32); } + + fn validate_state_change(&self, state: PowerState) -> Result<(), SeqError> { + match (self.get_state_impl(), state) { + (PowerState::A2, PowerState::A0) + | (PowerState::A0, PowerState::A2) + | (PowerState::A0PlusHP, PowerState::A2) + | (PowerState::A0Thermtrip, PowerState::A2) => Ok(()), + + _ => Err(SeqError::IllegalTransition), + } + } } impl idl::InOrderSequencerImpl for ServerImpl { @@ -59,17 +70,20 @@ impl idl::InOrderSequencerImpl for ServerImpl { _: &RecvMessage, state: PowerState, ) -> Result<(), RequestError> { - match (self.get_state_impl(), state) { - (PowerState::A2, PowerState::A0) - | (PowerState::A0, PowerState::A2) - | (PowerState::A0PlusHP, PowerState::A2) - | (PowerState::A0Thermtrip, PowerState::A2) => { - self.set_state_impl(state); - Ok(()) - } + self.validate_state_change(state)?; + self.set_state_impl(state); + Ok(()) + } - _ => Err(RequestError::Runtime(SeqError::IllegalTransition)), - } + fn set_state_with_reason( + &mut self, + _: &RecvMessage, + state: PowerState, + _: StateChangeReason, + ) -> Result<(), RequestError> { + self.validate_state_change(state)?; + self.set_state_impl(state); + Ok(()) } fn send_hardware_nmi( @@ -99,7 +113,7 @@ impl NotificationHandler for ServerImpl { } mod idl { - use super::SeqError; + use super::{SeqError, StateChangeReason}; include!(concat!(env!("OUT_DIR"), "/server_stub.rs")); } diff --git a/idl/cpu-seq.idol b/idl/cpu-seq.idol index f345f36ed..12015daa7 100644 --- a/idl/cpu-seq.idol +++ b/idl/cpu-seq.idol @@ -13,12 +13,29 @@ Interface( idempotent: true, ), "set_state": ( + doc: "Set the power state without providing a reason (legacy).", + args: { + "state": ( + type: "drv_cpu_power_state::PowerState", + recv: FromPrimitive("u8"), + ), + }, + reply: Result( + ok: "()", + err: CLike("SeqError"), + ), + ), + "set_state_with_reason": ( doc: "Set the power state", args: { "state": ( type: "drv_cpu_power_state::PowerState", recv: FromPrimitive("u8"), - ) + ), + "reason": ( + type: "StateChangeReason", + recv: FromPrimitive("u8"), + ), }, reply: Result( ok: "()", diff --git a/task/control-plane-agent/src/mgs_compute_sled.rs b/task/control-plane-agent/src/mgs_compute_sled.rs index d191cefcd..54c46855b 100644 --- a/task/control-plane-agent/src/mgs_compute_sled.rs +++ b/task/control-plane-agent/src/mgs_compute_sled.rs @@ -728,7 +728,10 @@ impl SpHandler for MgsHandler { }; self.sequencer - .set_state(power_state) + .set_state_with_reason( + power_state, + drv_cpu_seq_api::StateChangeReason::ControlPlane, + ) .map_err(|e| SpError::PowerStateError(e as u32)) } diff --git a/task/host-sp-comms/src/main.rs b/task/host-sp-comms/src/main.rs index f0d39438c..70c772956 100644 --- a/task/host-sp-comms/src/main.rs +++ b/task/host-sp-comms/src/main.rs @@ -12,7 +12,7 @@ use attest_data::messages::{ HostToRotCommand, RecvSprotError as AttestDataSprotError, RotToHost, MAX_DATA_LEN, }; -use drv_cpu_seq_api::{PowerState, SeqError, Sequencer}; +use drv_cpu_seq_api::{PowerState, SeqError, Sequencer, StateChangeReason}; use drv_hf_api::{HfDevSelect, HfMuxState, HostFlash}; use drv_sprot_api::SpRot; use drv_stm32xx_sys_api as sys_api; @@ -106,6 +106,7 @@ enum Trace { now: u64, #[count(children)] state: PowerState, + why: StateChangeReason, }, HfMux { now: u64, @@ -250,6 +251,12 @@ struct ServerImpl { reboot_state: Option, host_kv_storage: HostKeyValueStorage, hf_mux_state: Option, + /// Set when the host OS fails to boot or panics, and unset when the system + /// reboots. + /// + /// This is used to determine whether a host-triggered power-off is due to a + /// kernel panic, boot failure, or was a normal power-off. + last_power_off: Option, } impl ServerImpl { @@ -317,6 +324,7 @@ impl ServerImpl { dtrace_conf_len: 0, }, hf_mux_state: None, + last_power_off: None, } } @@ -374,23 +382,31 @@ impl ServerImpl { // basically only ever succeed in our initial set_state() request, so I // don't know how we'd test it fn power_off_host(&mut self, reboot: bool) { + let why = match self.last_power_off { + None if reboot => StateChangeReason::HostReboot, + None => StateChangeReason::HostPowerOff, + Some(reason) => reason, + }; loop { // Attempt to move to A2; given we only call this function in // response to a host request, we expect we're currently in A0 and // this should work. - let err = match self.sequencer.set_state(PowerState::A2) { - Ok(()) => { - ringbuf_entry!(Trace::SetState { - now: sys_get_timer().now, - state: PowerState::A2, - }); - if reboot { - self.reboot_state = Some(RebootState::WaitingForA2); + let err = + match self.sequencer.set_state_with_reason(PowerState::A2, why) + { + Ok(()) => { + ringbuf_entry!(Trace::SetState { + now: sys_get_timer().now, + why, + state: PowerState::A2, + }); + if reboot { + self.reboot_state = Some(RebootState::WaitingForA2); + } + return; } - return; - } - Err(err) => err, - }; + Err(err) => err, + }; // The only error we should see from `set_state()` is an illegal // transition, if we're not currently in A0. @@ -463,10 +479,14 @@ impl ServerImpl { // we cannot let the SoC simply reset because the true state // of hidden cores is unknown: explicitly bounce to A2 // as if the host had requested it. + self.last_power_off = Some(StateChangeReason::CpuReset); self.power_off_host(true); } PowerState::A0 | PowerState::A0PlusHP | PowerState::A0Thermtrip => { + // Clear the last power-off, as we have now reached A0; + // subsequent power-offs will set a new reason. + self.last_power_off = None; // TODO should we clear self.reboot_state here? What if we // transitioned from one A0 state to another? For now, leave it // set, and we'll move back to A0 whenever we transition to @@ -795,6 +815,9 @@ impl ServerImpl { Some(response) } HostToSp::HostBootFailure { reason } => { + // Indicate that the host boot failed, so that we can then tell + // sequencer why we are asking it to power off the system. + self.last_power_off = Some(StateChangeReason::HostBootFailure); // TODO forward to MGS // // For now, copy it into a static var we can pull out via @@ -812,6 +835,14 @@ impl ServerImpl { Some(SpToHost::Ack) } HostToSp::HostPanic => { + // Indicate that a subsequent power-off request will be due to a + // panic. Since a host panic message is also sent after a host + // boot failure, only set the last power-off reason if we + // haven't just seen a boot failure. + if self.last_power_off.is_none() { + self.last_power_off = Some(StateChangeReason::HostPanic); + } + // TODO forward to MGS // // For now, copy it into a static var we can pull out via @@ -1354,6 +1385,10 @@ impl NotificationHandler for ServerImpl { handle_reboot_waiting_in_a2_timer( &self.sequencer, &mut self.reboot_state, + // If the last power-off did not set a reason, it must + // be a host-requested reboot. + self.last_power_off + .unwrap_or(StateChangeReason::HostReboot), ); } Timers::TxPeriodicZeroByte => { @@ -1407,6 +1442,7 @@ fn parse_received_message( fn handle_reboot_waiting_in_a2_timer( sequencer: &Sequencer, reboot_state: &mut Option, + why: StateChangeReason, ) { // If we're past the deadline for transitioning to A0, attempt to do so. if let Some(RebootState::WaitingInA2RebootDelay) = reboot_state { @@ -1418,9 +1454,10 @@ fn handle_reboot_waiting_in_a2_timer( // we've done what we can to reboot, so clear out `reboot_state`. ringbuf_entry!(Trace::SetState { now: sys_get_timer().now, + why, state: PowerState::A0, }); - _ = sequencer.set_state(PowerState::A0); + _ = sequencer.set_state_with_reason(PowerState::A0, why); *reboot_state = None; } } diff --git a/task/thermal/src/bsp/gimlet_bcdef.rs b/task/thermal/src/bsp/gimlet_bcdef.rs index 220a2fe2d..ba08f04c4 100644 --- a/task/thermal/src/bsp/gimlet_bcdef.rs +++ b/task/thermal/src/bsp/gimlet_bcdef.rs @@ -12,7 +12,7 @@ use crate::{ i2c_config::{devices, sensors}, }; pub use drv_cpu_seq_api::SeqError; -use drv_cpu_seq_api::{PowerState, Sequencer}; +use drv_cpu_seq_api::{PowerState, Sequencer, StateChangeReason}; use task_sensor_api::SensorId; use task_thermal_api::ThermalProperties; use userlib::{task_slot, units::Celsius, TaskId, UnwrapLite}; @@ -109,7 +109,8 @@ impl Bsp { } pub fn power_down(&self) -> Result<(), SeqError> { - self.seq.set_state(PowerState::A2) + self.seq + .set_state_with_reason(PowerState::A2, StateChangeReason::Overheat) } pub fn power_mode(&self) -> PowerBitmask {