Skip to content

Commit

Permalink
Add reason to TenantState::Broken (#3954)
Browse files Browse the repository at this point in the history
Reason and backtrace are added to the Broken state. Backtrace is automatically collected when tenant entered the broken state. The format for API, CLI and metrics is changed and unified to return tenant state name in camel case. Previously snake case was used for metrics and camel case was used for everything else. Now tenant state field in TenantInfo swagger spec is changed to contain state name in "slug" field and other fields (currently only reason and backtrace for Broken variant in "data" field). To allow for this breaking change state was removed from TenantInfo swagger spec because it was not used anywhere.

Please note that the tenant's broken reason is not persisted on disk so the reason is lost when pageserver is restarted.

Requires changes to grafana dashboard that monitors tenant states.

Closes #3001

---------

Co-authored-by: theirix <theirix@gmail.com>
  • Loading branch information
LizardWizzard and theirix authored Apr 13, 2023
1 parent 732acc5 commit 15d1f85
Show file tree
Hide file tree
Showing 19 changed files with 181 additions and 99 deletions.
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion libs/pageserver_api/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@ license.workspace = true
[dependencies]
serde.workspace = true
serde_with.workspace = true
serde_json.workspace = true
const_format.workspace = true
anyhow.workspace = true
bytes.workspace = true
byteorder.workspace = true
utils.workspace = true
postgres_ffi.workspace = true
enum-map.workspace = true
serde_json.workspace = true
strum.workspace = true
strum_macros.workspace = true

workspace_hack.workspace = true
106 changes: 87 additions & 19 deletions libs/pageserver_api/src/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use std::{
use byteorder::{BigEndian, ReadBytesExt};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use strum_macros;
use utils::{
history_buffer::HistoryBufferWithDropCounter,
id::{NodeId, TenantId, TimelineId},
Expand All @@ -18,11 +19,23 @@ use anyhow::bail;
use bytes::{BufMut, Bytes, BytesMut};

/// A state of a tenant in pageserver's memory.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[derive(
Clone,
PartialEq,
Eq,
serde::Serialize,
serde::Deserialize,
strum_macros::Display,
strum_macros::EnumString,
strum_macros::EnumVariantNames,
strum_macros::AsRefStr,
strum_macros::IntoStaticStr,
)]
#[serde(tag = "slug", content = "data")]
pub enum TenantState {
// This tenant is being loaded from local disk
/// This tenant is being loaded from local disk
Loading,
// This tenant is being downloaded from cloud storage.
/// This tenant is being downloaded from cloud storage.
Attaching,
/// Tenant is fully operational
Active,
Expand All @@ -31,15 +44,7 @@ pub enum TenantState {
Stopping,
/// A tenant is recognized by the pageserver, but can no longer be used for
/// any operations, because it failed to be activated.
Broken,
}

pub mod state {
pub const LOADING: &str = "loading";
pub const ATTACHING: &str = "attaching";
pub const ACTIVE: &str = "active";
pub const STOPPING: &str = "stopping";
pub const BROKEN: &str = "broken";
Broken { reason: String, backtrace: String },
}

impl TenantState {
Expand All @@ -49,17 +54,26 @@ impl TenantState {
Self::Attaching => true,
Self::Active => false,
Self::Stopping => false,
Self::Broken => false,
Self::Broken { .. } => false,
}
}

pub fn as_str(&self) -> &'static str {
pub fn broken_from_reason(reason: String) -> Self {
let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture());
Self::Broken {
reason,
backtrace: backtrace_str,
}
}
}

impl std::fmt::Debug for TenantState {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
TenantState::Loading => state::LOADING,
TenantState::Attaching => state::ATTACHING,
TenantState::Active => state::ACTIVE,
TenantState::Stopping => state::STOPPING,
TenantState::Broken => state::BROKEN,
Self::Broken { reason, backtrace } if !reason.is_empty() => {
write!(f, "Broken due to: {reason}. Backtrace:\n{backtrace}")
}
_ => write!(f, "{self}"),
}
}
}
Expand Down Expand Up @@ -615,6 +629,7 @@ impl PagestreamBeMessage {
#[cfg(test)]
mod tests {
use bytes::Buf;
use serde_json::json;

use super::*;

Expand Down Expand Up @@ -665,4 +680,57 @@ mod tests {
assert!(msg == reconstructed);
}
}

#[test]
fn test_tenantinfo_serde() {
// Test serialization/deserialization of TenantInfo
let original_active = TenantInfo {
id: TenantId::generate(),
state: TenantState::Active,
current_physical_size: Some(42),
has_in_progress_downloads: Some(false),
};
let expected_active = json!({
"id": original_active.id.to_string(),
"state": {
"slug": "Active",
},
"current_physical_size": 42,
"has_in_progress_downloads": false,
});

let original_broken = TenantInfo {
id: TenantId::generate(),
state: TenantState::Broken {
reason: "reason".into(),
backtrace: "backtrace info".into(),
},
current_physical_size: Some(42),
has_in_progress_downloads: Some(false),
};
let expected_broken = json!({
"id": original_broken.id.to_string(),
"state": {
"slug": "Broken",
"data": {
"backtrace": "backtrace info",
"reason": "reason",
}
},
"current_physical_size": 42,
"has_in_progress_downloads": false,
});

assert_eq!(
serde_json::to_value(&original_active).unwrap(),
expected_active
);

assert_eq!(
serde_json::to_value(&original_broken).unwrap(),
expected_broken
);
assert!(format!("{:?}", &original_broken.state).contains("reason"));
assert!(format!("{:?}", &original_broken.state).contains("backtrace info"));
}
}
3 changes: 0 additions & 3 deletions pageserver/src/http/openapi_spec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -829,12 +829,9 @@ components:
type: object
required:
- id
- state
properties:
id:
type: string
state:
type: string
current_physical_size:
type: integer
has_in_progress_downloads:
Expand Down
6 changes: 3 additions & 3 deletions pageserver/src/http/routes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,7 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
.iter()
.map(|(id, state)| TenantInfo {
id: *id,
state: *state,
state: state.clone(),
current_physical_size: None,
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
})
Expand All @@ -490,7 +490,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
let state = tenant.current_state();
Ok(TenantInfo {
id: tenant_id,
state,
state: state.clone(),
current_physical_size: Some(current_physical_size),
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
})
Expand Down Expand Up @@ -931,7 +931,7 @@ async fn handle_tenant_break(r: Request<Body>) -> Result<Response<Body>, ApiErro
.await
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;

tenant.set_broken("broken from test");
tenant.set_broken("broken from test".to_owned());

json_response(StatusCode::OK, ())
}
Expand Down
14 changes: 3 additions & 11 deletions pageserver/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ use metrics::{
UIntGauge, UIntGaugeVec,
};
use once_cell::sync::Lazy;
use pageserver_api::models::state;
use pageserver_api::models::TenantState;
use strum::VariantNames;
use utils::id::{TenantId, TimelineId};

/// Prometheus histogram buckets (in seconds) for operations in the critical
Expand Down Expand Up @@ -147,15 +148,6 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define current logical size metric")
});

// Metrics collected on tenant states.
const TENANT_STATE_OPTIONS: &[&str] = &[
state::LOADING,
state::ATTACHING,
state::ACTIVE,
state::STOPPING,
state::BROKEN,
];

pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_tenant_states_count",
Expand Down Expand Up @@ -707,7 +699,7 @@ impl Drop for TimelineMetrics {
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
let tid = tenant_id.to_string();
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
for state in TENANT_STATE_OPTIONS {
for state in TenantState::VARIANTS {
let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
}
}
Expand Down
Loading

0 comments on commit 15d1f85

Please sign in to comment.