Skip to content

Commit

Permalink
fix: add nonreporting server metrics (#1763)
Browse files Browse the repository at this point in the history
<!-- Please make sure there is an issue that this PR is correlated to. -->

## Changes

<!-- If there are frontend changes, please include screenshots. -->
  • Loading branch information
MasterPtato committed Jan 2, 2025
1 parent de64037 commit 3fbcdbd
Show file tree
Hide file tree
Showing 11 changed files with 62 additions and 35 deletions.
34 changes: 23 additions & 11 deletions Cargo.toml

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions packages/services/cluster/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ pub mod ops;
pub mod types;
pub mod util;
pub mod workflows;
pub mod metrics;

pub fn registry() -> WorkflowResult<Registry> {
use workflows::*;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,4 +84,11 @@ lazy_static::lazy_static! {
PROVISION_BUCKETS.to_vec(),
*REGISTRY,
).unwrap();

/// Integer gauge vector exported as `provision_nonreporting_server` and registered in the
/// shared `REGISTRY`. Each series is identified by the labels `cluster_id`, `datacenter_id`,
/// `server_id`, `provider_datacenter_id` and `pool_type`.
///
/// Registration is unwrapped, so a failure to register panics the first time this static
/// is accessed.
// NOTE(review): presumably set for servers whose topology entry has `missing == true`;
// the code that writes this gauge is not visible here — confirm against the caller.
pub static ref NONREPORTING_SERVER: IntGaugeVec = register_int_gauge_vec_with_registry!(
"provision_nonreporting_server",
"Servers without reporting Prometheus metrics.",
&["cluster_id", "datacenter_id", "server_id", "provider_datacenter_id", "pool_type"],
*REGISTRY,
).unwrap();
}
36 changes: 22 additions & 14 deletions packages/services/cluster/src/ops/datacenter/topology_get.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ pub struct Server {
pub pool_type: PoolType,
pub usage: Stats,
pub limits: Stats,
/// Whether metrics for this server could not be found, so its stats fell back to default values
pub missing: bool,
}

#[derive(Clone, Debug, Default)]
Expand Down Expand Up @@ -290,7 +292,7 @@ pub async fn cluster_datacenter_topology_get(
// We assume a server has the default memory
// amount (memory of the first hardware in the list) if it is not provisioned yet

let (usage, limits) = match &server.runtime {
let (usage, limits, missing) = match &server.runtime {
Runtime::Nomad(nomad_node_id) => {
// Gracefully handle if node does not exist in API response
if let Some(node) = node_info.iter().find(|node| {
Expand Down Expand Up @@ -353,30 +355,33 @@ pub async fn cluster_datacenter_topology_get(
bandwidth: 0, // TODO:
};

(usage, limits)
(usage, limits, false)
} else {
tracing::error!(%nomad_node_id, "node not found in nomad response");

(Stats::default(), Stats::default())
(Stats::default(), Stats::default(), true)
}
}
Runtime::Pegboard(pegboard_client_id) => {
// Gracefully handle if client usage does not exist
let usage = if let Some(client) = pb_client_usage_res
let (usage, missing) = if let Some(client) = pb_client_usage_res
.clients
.iter()
.find(|client| &client.client_id == pegboard_client_id)
{
Stats {
cpu: client.usage.cpu,
memory: client.usage.memory,
disk: client.usage.disk,
bandwidth: 0, // TODO:
}
(
Stats {
cpu: client.usage.cpu,
memory: client.usage.memory,
disk: client.usage.disk,
bandwidth: 0, // TODO:
},
false,
)
} else {
tracing::error!(%pegboard_client_id, "pegboard client not found in response");

Stats::default()
(Stats::default(), true)
};

(
Expand All @@ -386,19 +391,20 @@ pub async fn cluster_datacenter_topology_get(
server.provider_hardware.as_deref(),
default_provider_hardware,
)?,
missing,
)
}
Runtime::None => {
// Gracefully handle if prometheus metrics do not exist
let usage = if let Some(server_metrics) = prometheus_metrics
let (usage, missing) = if let Some(server_metrics) = prometheus_metrics
.as_ref()
.and_then(|x| x.get(&server.server_id))
{
server_metrics.clone()
(server_metrics.clone(), false)
} else {
tracing::warn!(server_id=%server.server_id, "no prometheus metrics for server");

Stats::default()
(Stats::default(), true)
};

(
Expand All @@ -408,6 +414,7 @@ pub async fn cluster_datacenter_topology_get(
server.provider_hardware.as_deref(),
default_provider_hardware,
)?,
missing,
)
}
};
Expand All @@ -418,6 +425,7 @@ pub async fn cluster_datacenter_topology_get(
pool_type: server.pool_type,
usage,
limits,
missing,
});
}

Expand Down
1 change: 0 additions & 1 deletion packages/services/cluster/src/util/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ use cloudflare::{endpoints as cf, framework as cf_framework};

use crate::types::PoolType;

pub mod metrics;
pub mod test;

// Use the hash of the server install script in the image variant so that if the install scripts are updated
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use ssh2::Session;

use crate::{
types::{Datacenter, PoolType},
util::metrics,
metrics,
};

mod install_scripts;
Expand Down
2 changes: 1 addition & 1 deletion packages/services/cluster/src/workflows/server/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ pub(crate) mod undrain;

use crate::{
types::{Pool, PoolType, Provider},
util::metrics,
metrics,
};

#[derive(Debug, Serialize, Deserialize)]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::convert::{TryFrom, TryInto};
use chirp_workflow::prelude::*;
use cluster::{
types::{Datacenter, PoolType},
util::metrics,
metrics,
};

pub async fn start(config: rivet_config::Config, pools: rivet_pools::Pools) -> GlobalResult<()> {
Expand Down
6 changes: 3 additions & 3 deletions packages/services/pegboard/src/metrics.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
use rivet_metrics::{prometheus::*, REGISTRY};

lazy_static::lazy_static! {
pub static ref PEGBOARD_CLIENT_DUPLICATE_EVENT: IntCounterVec = register_int_counter_vec_with_registry!(
pub static ref CLIENT_DUPLICATE_EVENT: IntCounterVec = register_int_counter_vec_with_registry!(
"pegboard_client_duplicate_event",
"Duplicate client event that was attempted to be inserted.",
&["client_id", "index"],
*REGISTRY
).unwrap();

pub static ref PEGBOARD_CLIENT_LAST_PING: IntGaugeVec = register_int_gauge_vec_with_registry!(
pub static ref CLIENT_LAST_PING: IntGaugeVec = register_int_gauge_vec_with_registry!(
"pegboard_client_last_ping",
"Last client ping timestamp, in ms.",
&["client_id"],
*REGISTRY
).unwrap();

pub static ref PEGBOARD_CLIENT_ACTORS_ALLOCATED: IntCounterVec = register_int_counter_vec_with_registry!(
pub static ref CLIENT_ACTORS_ALLOCATED: IntCounterVec = register_int_counter_vec_with_registry!(
"pegboard_client_actors_allocated",
"Total actors allocated on a client.",
&["client_id"],
Expand Down
2 changes: 1 addition & 1 deletion packages/services/pegboard/src/workflows/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ async fn insert_events(ctx: &ActivityCtx, input: &InsertEventsInput) -> GlobalRe
continue;
}

metrics::PEGBOARD_CLIENT_DUPLICATE_EVENT
metrics::CLIENT_DUPLICATE_EVENT
.with_label_values(&[&input.client_id.to_string(), &event.index.to_string()])
.inc();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,13 @@ pub async fn run_from_env(
)?;

for (client_id, last_ping_ts) in client_ping {
pegboard::metrics::PEGBOARD_CLIENT_LAST_PING
pegboard::metrics::CLIENT_LAST_PING
.with_label_values(&[&client_id.to_string()])
.set(last_ping_ts);
}

for (client_id, count) in client_actors {
pegboard::metrics::PEGBOARD_CLIENT_ACTORS_ALLOCATED
pegboard::metrics::CLIENT_ACTORS_ALLOCATED
.with_label_values(&[&client_id.to_string()])
.set(count.try_into()?);
}
Expand Down

0 comments on commit 3fbcdbd

Please sign in to comment.