Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new workloads #25106

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 24 additions & 28 deletions enos/enos-scenario-upgrade.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ scenario "upgrade" {
module = module.provision_cluster
variables {
name = local.cluster_name
nomad_local_binary = step.copy_initial_binary.nomad_local_binary
nomad_local_binary = step.copy_initial_binary.binary_path[matrix.os]
nomad_local_binary_server = step.copy_initial_binary.binary_path[local.server_os]
Comment on lines +68 to +69
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should rebase this PR on #25172 once that's been merged.

server_count = var.server_count
client_count_linux = local.linux_count
client_count_windows_2016 = local.windows_count
Expand All @@ -91,6 +92,14 @@ scenario "upgrade" {
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
workloads = {
service_raw_exec = { job_spec = "jobs/raw-exec-service.nomad.hcl", alloc_count = 3, type = "service" }
service_docker = { job_spec = "jobs/docker-service.nomad.hcl", alloc_count = 3, type = "service" }
system_docker = { job_spec = "jobs/docker-system.nomad.hcl", alloc_count = 0, type = "system" }
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The alloc count for system jobs doesn't really matter; it's set to 0 to emphasize that.

batch_docker = { job_spec = "jobs/docker-batch.nomad.hcl", alloc_count = 3, type = "batch" }
batch_raw_exec = { job_spec = "jobs/raw-exec-batch.nomad.hcl", alloc_count = 3, type = "batch" }
system_raw_exec = { job_spec = "jobs/raw-exec-system.nomad.hcl", alloc_count = 0, type = "system" }
}
}

verifies = [
Expand Down Expand Up @@ -150,8 +159,8 @@ scenario "upgrade" {
arch = local.arch
edition = matrix.edition
product_version = var.upgrade_version
os = matrix.os
download_binary = false
oss = [local.server_os, matrix.os]
download_binaries = false
}
}

Expand Down Expand Up @@ -193,8 +202,8 @@ scenario "upgrade" {
ssh_key_path = step.provision_cluster.ssh_key_file
artifactory_username = var.artifactory_username
artifactory_token = var.artifactory_token
artifact_url = step.fetch_upgrade_binary.artifact_url
artifact_sha = step.fetch_upgrade_binary.artifact_sha
artifact_url = step.fetch_upgrade_binary.artifact_url[local.server_os]
artifact_sha = step.fetch_upgrade_binary.artifact_sha[local.server_os]
}
}

Expand Down Expand Up @@ -235,27 +244,6 @@ scenario "upgrade" {
]
}

/* step "run_workloads" {
depends_on = [step.server_upgrade_test_cluster_health]

description = <<-EOF
Verify the health of the cluster by running new workloads
EOF

module = module.run_workloads
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
}

verifies = [
quality.nomad_register_job,
]
}
*/
step "upgrade_clients" {
depends_on = [step.server_upgrade_test_cluster_health]

Expand Down Expand Up @@ -295,8 +283,8 @@ scenario "upgrade" {
ssh_key_path = step.provision_cluster.ssh_key_file
artifactory_username = var.artifactory_username
artifactory_token = var.artifactory_token
artifact_url = step.fetch_upgrade_binary.artifact_url
artifact_sha = step.fetch_upgrade_binary.artifact_sha
artifact_url = step.fetch_upgrade_binary.artifact_url[matrix.os]
artifact_sha = step.fetch_upgrade_binary.artifact_sha[matrix.os]
}
}

Expand Down Expand Up @@ -377,4 +365,12 @@ scenario "upgrade" {
value = step.provision_cluster.nomad_token
sensitive = true
}

output "binary_path" {
value = step.copy_initial_binary.binary_path
}

output "allocs" {
value = step.run_initial_workloads.allocs_count
}
}
30 changes: 30 additions & 0 deletions enos/modules/run_workloads/jobs/docker-batch.nomad.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
variable "alloc_count" {
type = number
default = 1
}

job "batch-docker" {
type = "batch"

group "batch-docker" {
count = var.alloc_count

task "batch-docker" {
driver = "docker"

config {
image = "alpine:latest"
command = "sh"
args = ["-c", "while true; do sleep 30000; done"]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Batch workloads are a little tricky -- we probably want to make sure that they can complete and not get rescheduled by client/server restarts, rather than having them wait forever. But that'll require some new assertion logic, so let's come back to that.


}

resources {
cpu = 50
memory = 64
}
}
}
}
4 changes: 2 additions & 2 deletions enos/modules/run_workloads/jobs/docker-service.nomad.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ job "service-docker" {
}

resources {
cpu = 100
memory = 128
cpu = 50
memory = 64
}
}
}
Expand Down
29 changes: 29 additions & 0 deletions enos/modules/run_workloads/jobs/docker-system.nomad.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
variable "alloc_count" {
type = number
default = 1
}

job "system-docker" {
type = "system"

group "system-docker" {

task "system-docker" {
driver = "docker"

config {
image = "alpine:latest"
command = "sh"
args = ["-c", "while true; do sleep 30000; done"]
Comment on lines +17 to +19
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For service/system jobs, we should probably use a workload where we can assert more about it than "is the alloc running?". busybox httpd would let us run a network service, so that we're exercising things like restoring CNI. Ok to leave for this PR, but let's come back to this too.


}

resources {
cpu = 50
memory = 64
}
}
}
}
41 changes: 41 additions & 0 deletions enos/modules/run_workloads/jobs/raw-exec-batch.nomad.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

# Number of allocations (group count) the batch job should run.
variable "alloc_count" {
  type    = number
  default = 1
}

# Minimal batch workload for the run_workloads module, driven by the
# raw_exec task driver.
# NOTE(review): the templated script loops forever, so this batch job never
# completes on its own -- it only exercises "alloc is running" checks.
# Making batch jobs run to completion is deliberately deferred (see PR
# discussion).
job "batch-raw-exec" {
  type = "batch"

  group "batch-raw-exec" {
    # One allocation per requested count.
    count = var.alloc_count

    task "batch-raw-exec" {
      driver = "raw_exec"

      # Executes the script rendered by the template block below.
      config {
        command = "bash"
        args    = ["-c", "./local/runme.sh"]
      }

      # Rendered into the task's local dir; keeps the task alive indefinitely.
      template {
        data        = <<EOH
#!/bin/bash

while true; do
sleep 30000
done
EOH
        destination = "local/runme.sh"
        perms       = "755"
      }

      # Small footprint so many allocs fit on the test clients.
      resources {
        cpu    = 50
        memory = 64
      }
    }
  }
}
5 changes: 5 additions & 0 deletions enos/modules/run_workloads/jobs/raw-exec-service.nomad.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ EOH
destination = "local/runme.sh"
perms = "755"
}

resources {
cpu = 50
memory = 64
}
}
}
}
40 changes: 40 additions & 0 deletions enos/modules/run_workloads/jobs/raw-exec-system.nomad.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

variable "alloc_count" {
type = number
default = 1
}

job "system-raw-exec" {
type = "system"

group "system-raw-exec" {

task "system-raw-exec" {
driver = "raw_exec"

config {
command = "bash"
args = ["-c", "./local/runme.sh"]
}

template {
data = <<EOH
#!/bin/bash

while true; do
sleep 30000
done
EOH
destination = "local/runme.sh"
perms = "755"
}

resources {
cpu = 50
memory = 64
}
}
}
}
27 changes: 25 additions & 2 deletions enos/modules/run_workloads/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,16 @@ terraform {
}

locals {
nomad_env = { NOMAD_ADDR = var.nomad_addr
nomad_env = {
NOMAD_ADDR = var.nomad_addr
NOMAD_CACERT = var.ca_file
NOMAD_CLIENT_CERT = var.cert_file
NOMAD_CLIENT_KEY = var.key_file
NOMAD_TOKEN = var.nomad_token }
NOMAD_TOKEN = var.nomad_token
}

system_job_count = length({ for k, v in var.workloads : k => v if v.type == "system" })
service_batch_allocs = sum([for wl in var.workloads : wl.alloc_count])
}

resource "enos_local_exec" "wait_for_nomad_api" {
Expand All @@ -23,6 +28,24 @@ resource "enos_local_exec" "wait_for_nomad_api" {
scripts = [abspath("${path.module}/scripts/wait_for_nomad_api.sh")]
}

resource "enos_local_exec" "get_nodes" {
environment = local.nomad_env

inline = ["nomad node status -json | jq '[.[] | select(.Status == \"ready\")] | length'"]
}

resource "enos_local_exec" "get_jobs" {
environment = local.nomad_env

inline = ["nomad job status| awk '$4 == \"running\" {count++} END {print count+0}'"]
}

resource "enos_local_exec" "get_allocs" {
environment = local.nomad_env

inline = ["nomad alloc status -json | jq '[.[] | select(.ClientStatus == \"running\")] | length'"]
}

resource "enos_local_exec" "workloads" {
for_each = var.workloads

Expand Down
21 changes: 16 additions & 5 deletions enos/modules/run_workloads/outputs.tf
Original file line number Diff line number Diff line change
@@ -1,16 +1,27 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

/* output "jobs_count" {
value = length(local.job_names)
} */

output "jobs_count" {
description = "The number of jobs that should be running in the cluster"
value = length(var.workloads) + chomp(enos_local_exec.get_jobs.stdout)
}

output "new_jobs_count" {
description = "The number of jobs that were triggered by the module"
value = length(var.workloads)
}

output "allocs_count" {
description = "The number of allocs that should be running in the cluster"
value = sum([for wl in var.workloads : wl.alloc_count])
value = local.system_job_count * chomp(enos_local_exec.get_nodes.stdout) + local.service_batch_allocs + chomp(enos_local_exec.get_allocs.stdout)
}

output "nodes" {
description = "Number of current clients in the cluster"
value = chomp(enos_local_exec.get_nodes.stdout)
}

output "new_allocs_count" {
description = "The number of new allocs expected from the workloads submitted by this module (excludes allocs that were already running)"
value = local.system_job_count * chomp(enos_local_exec.get_nodes.stdout) + local.service_batch_allocs
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's this output for? It doesn't match allocs_count

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Im trying to make this module aware of existing allocs so it outputs all the running allocs, new and old, and the output can be directly used for a next step in enos, because it does not accept functions as step.variables, so this output helps me debug

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense! Let's update the descriptions to make that clear. Right now this is the same as allocs_count.

6 changes: 1 addition & 5 deletions enos/modules/run_workloads/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,6 @@ variable "workloads" {
type = map(object({
job_spec = string
alloc_count = number
type = string
}))

default = {
service_raw_exec = { job_spec = "jobs/raw-exec-service.nomad.hcl", alloc_count = 3 }
service_docker = { job_spec = "jobs/docker-service.nomad.hcl", alloc_count = 3 }
}
}
6 changes: 5 additions & 1 deletion enos/modules/test_cluster_health/scripts/allocs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,17 @@ while true; do
error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')"
fi

echo "Running allocs: $running_allocs, expected $ALLOC_COUNT. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
echo "Running allocs: $running_allocs, expected $ALLOC_COUNT. Waiting for $elapsed_time. Retrying in $POLL_INTERVAL seconds..."
sleep $POLL_INTERVAL
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done

echo "All ALLOCS are running."

if [ "$allocs_length" -eq 0 ]; then
exit 0
fi

# Quality: nomad_reschedule_alloc: A POST / PUT call to /v1/allocation/:alloc_id/stop results in the stopped allocation being rescheduled

random_index=$((RANDOM % allocs_length))
Expand Down
2 changes: 1 addition & 1 deletion enos/modules/test_cluster_health/scripts/jobs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ if [ -z "$jobs_length" ]; then
fi

if [ "$jobs_length" -ne "$JOB_COUNT" ]; then
error_exit "The number of running jobs ($jobs_length) does not match the expected count ($JOB_COUNT)\n$(nomad job status | awk 'NR > 1 && $4 != "running" {print $4}')"
error_exit "The number of running jobs ($jobs_length) does not match the expected count ($JOB_COUNT): $(nomad job status | awk 'NR > 1 && $4 != "running" {print $4}')"
fi

echo "All JOBS are running."