fix(clusters): add drain padding to nomad (#1100)
MasterPtato committed Sep 3, 2024
1 parent b460374 commit 01ee21b
Showing 4 changed files with 45 additions and 5 deletions.
2 changes: 1 addition & 1 deletion fern/definition/admin/clusters/common.yml

```diff
@@ -23,7 +23,7 @@ types:
       desired_count: integer
       min_count: integer
       max_count: integer
-      drain_timeout: long
+      drain_timeout_ms: long

   Hardware:
     properties:
```
3 changes: 3 additions & 0 deletions lib/bolt/cli/src/commands/cluster/datacenter.rs

```diff
@@ -282,6 +282,8 @@ mod render {
     #[tabled(display_with = "display_pool_type")]
     pub pool_type: Option<models::AdminClustersPoolType>,
+    #[tabled(display_with = "display_option")]
+    pub drain_timeout: Option<String>,
     #[tabled(display_with = "display_option")]
     pub min_count: Option<i32>,
     #[tabled(display_with = "display_option")]
     pub desired_count: Option<i32>,
@@ -303,6 +305,7 @@
             .chain(d.pools.iter().cloned().map(|pool| DcTableRow {
                 pool: PoolTableRow {
                     pool_type: Some(pool.pool_type),
+                    drain_timeout: Some(format!("{}s", pool.drain_timeout / 1000)),
                     min_count: Some(pool.min_count),
                     desired_count: Some(pool.desired_count),
                     max_count: Some(pool.max_count),
```
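The CLI stores the drain timeout in milliseconds but renders it in seconds via integer division, as in the `format!` call above. A minimal sketch of that formatting; the 1-hour value is a hypothetical example, not from the commit:

```rust
fn main() {
    let drain_timeout_ms: u64 = 3_600_000; // hypothetical 1-hour drain timeout, in ms

    // Integer division truncates any sub-second remainder, matching the diff above.
    let rendered = format!("{}s", drain_timeout_ms / 1000);

    assert_eq!(rendered, "3600s");
    println!("{rendered}");
}
```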
44 changes: 41 additions & 3 deletions svc/pkg/cluster/src/workflows/server/drain.rs

```diff
@@ -8,6 +8,10 @@ use serde_json::json;
 
 use crate::types::PoolType;
 
+// In ms, a small amount of time to separate the completion of the drain in Nomad from the
+// deletion of the cluster server. We want the Nomad drain to complete first.
+const NOMAD_DRAIN_PADDING: u64 = 10000;
+
 lazy_static::lazy_static! {
     static ref NOMAD_CONFIG: Configuration = nomad_util::new_config_from_env().unwrap();
 }
@@ -17,18 +21,24 @@ pub(crate) struct Input {
     pub datacenter_id: Uuid,
     pub server_id: Uuid,
     pub pool_type: PoolType,
-    pub drain_timeout: u64,
 }
 
 #[workflow]
 pub(crate) async fn cluster_server_drain(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResult<()> {
+    let drain_timeout = ctx
+        .activity(GetDrainTimeoutInput {
+            datacenter_id: input.datacenter_id,
+            pool_type: input.pool_type.clone(),
+        })
+        .await?;
+
     match input.pool_type {
         PoolType::Job => {
             let started_drain = ctx
                 .activity(DrainNodeInput {
                     datacenter_id: input.datacenter_id,
                     server_id: input.server_id,
-                    drain_timeout: input.drain_timeout,
+                    drain_timeout,
                 })
                 .await?;
 
@@ -57,6 +67,32 @@ pub(crate) async fn cluster_server_drain(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResult<()> {
     Ok(())
 }
 
+#[derive(Debug, Serialize, Deserialize, Hash)]
+pub(crate) struct GetDrainTimeoutInput {
+    pub datacenter_id: Uuid,
+    pub pool_type: PoolType,
+}
+
+#[activity(GetDrainTimeout)]
+pub(crate) async fn get_drain_timeout(
+    ctx: &ActivityCtx,
+    input: &GetDrainTimeoutInput,
+) -> GlobalResult<u64> {
+    let dcs_res = ctx
+        .op(crate::ops::datacenter::get::Input {
+            datacenter_ids: vec![input.datacenter_id],
+        })
+        .await?;
+    let dc = unwrap!(dcs_res.datacenters.into_iter().next());
+
+    let pool = unwrap!(
+        dc.pools.iter().find(|p| p.pool_type == input.pool_type),
+        "datacenter does not have this type of pool configured"
+    );
+
+    Ok(pool.drain_timeout)
+}
+
 #[derive(Debug, Serialize, Deserialize, Hash)]
 struct DrainNodeInput {
     datacenter_id: Uuid,
@@ -85,7 +121,9 @@ async fn drain_node(ctx: &ActivityCtx, input: &DrainNodeInput) -> GlobalResult<bool> {
             models::NodeUpdateDrainRequest {
                 drain_spec: Some(Box::new(models::DrainSpec {
                     // In nanoseconds. `drain_timeout` must be below 292 years for this to not overflow
-                    deadline: Some((input.drain_timeout * 1000000) as i64),
+                    deadline: Some(
+                        (input.drain_timeout.saturating_sub(NOMAD_DRAIN_PADDING) * 1000000) as i64,
+                    ),
                     ignore_system_jobs: None,
                 })),
                 mark_eligible: None,
```
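Two things change in this file: the drain timeout is now resolved inside the workflow by the new `GetDrainTimeout` activity instead of arriving as workflow input, and the Nomad drain deadline subtracts `NOMAD_DRAIN_PADDING` before the millisecond-to-nanosecond conversion, so the Nomad drain deadline lands before the cluster server is deleted. A minimal sketch of that deadline arithmetic; the function name and sample values are hypothetical, only the formula mirrors the diff:

```rust
// Same constant as in drain.rs, in milliseconds.
const NOMAD_DRAIN_PADDING: u64 = 10000;

fn nomad_drain_deadline_ns(drain_timeout_ms: u64) -> i64 {
    // `saturating_sub` clamps at zero, so a timeout shorter than the padding
    // produces an immediate deadline instead of underflowing to a huge u64.
    (drain_timeout_ms.saturating_sub(NOMAD_DRAIN_PADDING) * 1000000) as i64
}

fn main() {
    // A hypothetical 1-hour drain timeout loses 10s of padding.
    assert_eq!(nomad_drain_deadline_ns(3_600_000), 3_590_000_000_000);
    // Timeouts at or below the padding drain immediately.
    assert_eq!(nomad_drain_deadline_ns(5_000), 0);
    println!("deadline checks passed");
}
```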
1 change: 0 additions & 1 deletion svc/pkg/cluster/src/workflows/server/mod.rs

```diff
@@ -289,7 +289,6 @@ pub(crate) async fn cluster_server(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResult<()> {
                 datacenter_id: input.datacenter_id,
                 server_id: input.server_id,
                 pool_type: input.pool_type.clone(),
-                drain_timeout: pool.drain_timeout,
             })
             .await?;
     }
```
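With the timeout now looked up by the `GetDrainTimeout` activity at drain time, the value no longer needs to travel through the workflow input, which appears to be why the `drain_timeout` field could be dropped from `Input` and from this dispatch site.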
