Skip to content

Commit

Permalink
fix: datacenter taint draining too soon, datacenter update not updating drain timeout
Browse files Browse the repository at this point in the history
  • Loading branch information
MasterPtato authored and NathanFlurry committed May 4, 2024
1 parent cea1fe7 commit cbea4d6
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 8 deletions.
4 changes: 2 additions & 2 deletions infra/tf/k8s_infra/grafana_dashboards/provisioning.json
Original file line number Diff line number Diff line change
Expand Up @@ -1196,7 +1196,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "last_over_time((\n\tsum by (server_id) (\n\t\tirate(\n\t\t\tnode_cpu_seconds_total{\n\t\t\t\tdatacenter_id=~\"[[datacenter_id]]\",\n\t\t\t\tpool_type=\"gg\",\n\n\t\t\t\tmode!=\"idle\",\n\t\t\t\tmode!=\"iowait\",\n\t\t\t\tmode!=\"steal\"\n\t\t\t}\n\t\t\t[5m]\n\t\t)\n\t) * 100\n) [15m:15s])",
"expr": "sum by (server_id) (\n\tirate(\n\t\tnode_cpu_seconds_total{\n\t\t\tdatacenter_id=~\"[[datacenter_id]]\",\n\t\t\tpool_type=\"gg\",\n\n\t\t\tmode!=\"idle\",\n\t\t\tmode!=\"iowait\",\n\t\t\tmode!=\"steal\"\n\t\t}\n\t\t[5m]\n\t)\n) * 100",
"instant": false,
"interval": "",
"legendFormat": "__auto",
Expand Down Expand Up @@ -1390,7 +1390,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "last_over_time((\n\tsum by (server_id) (\n\t\tirate(\n\t\t\tnode_cpu_seconds_total{\n\t\t\t\tdatacenter_id=~\"[[datacenter_id]]\",\n\t\t\t\tpool_type=\"ats\",\n\n\t\t\t\tmode!=\"idle\",\n\t\t\t\tmode!=\"iowait\",\n\t\t\t\tmode!=\"steal\"\n\t\t\t}\n\t\t\t[5m]\n\t\t)\n\t) * 100\n) [15m:15s])",
"expr": "sum by (server_id) (\n\tirate(\n\t\tnode_cpu_seconds_total{\n\t\t\tdatacenter_id=~\"[[datacenter_id]]\",\n\t\t\tpool_type=\"ats\",\n\n\t\t\tmode!=\"idle\",\n\t\t\tmode!=\"iowait\",\n\t\t\tmode!=\"steal\"\n\t\t}\n\t\t[5m]\n\t)\n) * 100",
"instant": false,
"interval": "",
"legendFormat": "__auto",
Expand Down
1 change: 1 addition & 0 deletions lib/bolt/cli/src/commands/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ impl SubCommand {

// Load namespace config
let ns_config = if fs::metadata(&namespace_path).await.is_ok() {
// TODO (RVT-3747): Parse as plain toml
let ns_config =
ProjectContextData::read_ns(project_root.as_path(), &ns_id).await;

Expand Down
13 changes: 13 additions & 0 deletions svc/pkg/cluster/standalone/gc/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,13 @@ pub async fn run_from_env(ts: i64, pools: rivet_pools::Pools) -> GlobalResult<()
.find(|pool| pool.pool_type == server.pool_type as i32));
let drain_completed = server.drain_ts < ts - pool.drain_timeout as i64;

tracing::info!(
server_id=?server.server_id,
drain_ts=%server.drain_ts,
pool_drain_timeout=%pool.drain_timeout,
%drain_completed,
);

Ok((server, drain_completed))
})
.filter(|res| {
Expand All @@ -80,6 +87,12 @@ pub async fn run_from_env(ts: i64, pools: rivet_pools::Pools) -> GlobalResult<()
})
.collect::<GlobalResult<Vec<_>>>()?;

if drained_servers.is_empty() {
return Ok(Vec::new());
}

tracing::info!(count=%drained_servers.len(), "servers done draining");

// Update servers that have completed draining
sql_execute!(
[ctx, @tx tx]
Expand Down
2 changes: 1 addition & 1 deletion svc/pkg/cluster/standalone/metrics-publish/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ async fn start() -> GlobalResult<()> {
.name("cluster_metrics_publish::metrics")
.spawn(rivet_metrics::run_standalone())?;

let mut interval = tokio::time::interval(Duration::from_secs(15));
let mut interval = tokio::time::interval(Duration::from_secs(7));
loop {
interval.tick().await;

Expand Down
27 changes: 22 additions & 5 deletions svc/pkg/cluster/worker/src/workers/datacenter_scale.rs
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,7 @@ async fn drain_tainted_servers(
servers: &[Server],
pctx: &PoolCtx,
) -> GlobalResult<()> {
// Includes tainted and normal servers
let active_servers_in_pool = servers
.iter()
.filter(|server| server.pool_type == pctx.pool_type)
Expand All @@ -415,25 +416,35 @@ async fn drain_tainted_servers(
let active_tainted_servers_in_pool = active_servers_in_pool
.clone()
.filter(|server| server.is_tainted);
let active_tainted_count = active_tainted_servers_in_pool.clone().count();

// For job servers the "active" servers we count are ones with nomad successfully connected. Otherwise we
// count servers that have successfully installed
let relevant_active_count = match pctx.pool_type {
let active_untainted_count = match pctx.pool_type {
backend::cluster::PoolType::Job => active_servers_in_pool
.clone()
.filter(|server| server.has_nomad_node)
.filter(|server| !server.is_tainted)
.count(),
_ => active_servers_in_pool
.clone()
.filter(|server| server.is_installed)
.filter(|server| !server.is_tainted)
.count(),
};

let active_tainted_count = active_tainted_servers_in_pool.clone().count();

// tainted - (desired - running) -> tainted + running - desired
let drain_count =
(active_tainted_count + relevant_active_count).saturating_sub(pctx.desired_count);
(active_tainted_count + active_untainted_count).saturating_sub(pctx.desired_count);

tracing::info!(
?pctx.pool_type,
desired_count=%pctx.desired_count,
%active_untainted_count,
%active_tainted_count,
%drain_count,
"draining tainted servers",
);

drain_servers(
ctx,
Expand Down Expand Up @@ -465,7 +476,7 @@ async fn destroy_drained_servers(
return Ok(());
}

tracing::info!(count=%drained_server_ids.len(), "deleting drained servers");
tracing::info!(count=%drained_server_ids.len(), "destroying drained servers");

destroy_servers(ctx, tx, msgs, drained_server_ids.into_iter()).await
}
Expand All @@ -476,6 +487,8 @@ async fn drain_servers<I: Iterator<Item = Uuid> + Clone>(
msgs: &mut Vec<MsgFuture>,
server_ids: I,
) -> GlobalResult<()> {
tracing::info!(count=%server_ids.clone().count(), "draining servers");

// Mark servers as draining in db
sql_execute!(
[ctx, @tx tx]
Expand Down Expand Up @@ -511,6 +524,8 @@ async fn undrain_servers<I: Iterator<Item = Uuid> + Clone>(
msgs: &mut Vec<MsgFuture>,
server_ids: I,
) -> GlobalResult<()> {
tracing::info!(count=%server_ids.clone().count(), "undraining servers");

// Mark servers as not draining in db
sql_execute!(
[ctx, @tx tx]
Expand Down Expand Up @@ -595,6 +610,8 @@ async fn destroy_servers<I: Iterator<Item = Uuid> + Clone>(
msgs: &mut Vec<MsgFuture>,
server_ids: I,
) -> GlobalResult<()> {
tracing::info!(count=%server_ids.clone().count(), "destroying servers");

// Mark servers for destruction in db
sql_execute!(
[ctx, @tx tx]
Expand Down
3 changes: 3 additions & 0 deletions svc/pkg/cluster/worker/src/workers/datacenter_update.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ async fn worker(
if let Some(max_count) = pool.max_count {
current_pool.max_count = max_count;
}
if let Some(drain_timeout) = pool.drain_timeout {
current_pool.drain_timeout = drain_timeout;
}
}

// Encode config
Expand Down

0 comments on commit cbea4d6

Please sign in to comment.