Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Re-enable GPU profiles for GCP/AWS #1219

Merged
merged 11 commits into from
May 9, 2022
23 changes: 23 additions & 0 deletions qhub/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,10 +228,33 @@ class NodeGroup(Base):
min_nodes: int
max_nodes: int
gpu: typing.Optional[bool] = False
guest_accelerators: typing.Optional[typing.List[typing.Dict]] = []

class Config:
extra = "allow"

@validator("guest_accelerators")
def validate_guest_accelerators(cls, v):
    """Validate the ``guest_accelerators`` entries of a node group.

    Each entry must be a mapping with a string ``name`` and an integer
    ``count``. An empty or missing value is passed through untouched.

    Returns:
        The validated value, unchanged. A validator must return the value;
        returning nothing would replace the field with ``None``.

    Raises:
        ValueError: if the value is not a list, or if any entry is missing
            ``name``/``count`` or has them with the wrong type.
    """
    if not v:
        return v
    if not isinstance(v, list):
        raise ValueError("guest_accelerators must be a list")

    # Built once: the message does not depend on the entry being checked.
    error_message = """
In order to successfully use guest accelerators, you must specify the following parameters:

name (str): Machine type name of the GPU, available at https://cloud.google.com/compute/docs/gpus
count (int): Number of GPUs to attach to the instance

See general information regarding GPU support at:
https://docs.qhub.dev/en/stable/source/admin_guide/gpu.html?#add-gpu-node-group
"""
    for accelerator in v:
        # Explicit checks instead of ``assert``: assertions are stripped
        # when Python runs with -O, which would silently disable validation.
        if not isinstance(accelerator, dict):
            raise ValueError(error_message)
        if "name" not in accelerator or "count" not in accelerator:
            raise ValueError(error_message)
        if not isinstance(accelerator["name"], str) or not isinstance(
            accelerator["count"], int
        ):
            raise ValueError(error_message)

    return v


class DigitalOceanProvider(Base):
region: str
Expand Down
22 changes: 22 additions & 0 deletions qhub/stages/input_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ def stage_02_infrastructure(stage_outputs, config):
"instance_type": value["instance"],
"min_size": value["min_nodes"],
"max_size": value["max_nodes"],
"guest_accelerators": value["guest_accelerators"]
viniciusdc marked this conversation as resolved.
Show resolved Hide resolved
if "guest_accelerators" in value
else [],
**value,
}
for key, value in config["google_cloud_platform"]["node_groups"].items()
Expand Down Expand Up @@ -105,6 +108,23 @@ def stage_02_infrastructure(stage_outputs, config):


def stage_03_kubernetes_initialize(stage_outputs, config):
if config["provider"] == "gcp":
gpu_enabled = any(
node_group.get("guest_accelerators")
for node_group in config["google_cloud_platform"]["node_groups"].values()
)
gpu_node_group_names = []

elif config["provider"] == "aws":
gpu_enabled = any(
node_group.get("gpu")
for node_group in config["amazon_web_services"]["node_groups"].values()
)
gpu_node_group_names = [config["amazon_web_services"]["node_groups"].keys()]
else:
gpu_enabled = False
gpu_node_group_names = []

return {
"name": config["project_name"],
"environment": config["namespace"],
Expand All @@ -113,6 +133,8 @@ def stage_03_kubernetes_initialize(stage_outputs, config):
"external_container_reg": config.get(
"external_container_reg", {"enabled": False}
),
"gpu_enabled": gpu_enabled,
"gpu_node_group_names": gpu_node_group_names,
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,6 @@ locals {
])

merged_node_groups = [for node_group in var.node_groups : merge(var.node_group_defaults, node_group)]

viniciusdc marked this conversation as resolved.
Show resolved Hide resolved
# gpu_node_group_names = concat([for node_group in local.merged_node_groups : node_group.guest_accelerators])
}
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ resource "google_container_node_pool" "main" {
for_each = local.merged_node_groups[count.index].guest_accelerators

content {
type = guest_accelerator.value.type
type = guest_accelerator.value.name
count = guest_accelerator.value.count
}
}
Expand Down

This file was deleted.

10 changes: 10 additions & 0 deletions qhub/template/stages/03-kubernetes-initialize/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,13 @@ module "kubernetes-autoscaling" {
# Instantiates the local module found at ./modules/traefik_crds.
# No input variables are passed to it.
module "traefik-crds" {
source = "./modules/traefik_crds"
}

# Instantiates the local module at ./modules/nvidia-installer, but only when
# var.gpu_enabled is true (count is 1); otherwise no instance is created.
module "nvidia-driver-installer" {
# Conditional creation: one instance when GPUs are enabled, none otherwise.
count = var.gpu_enabled ? 1 : 0

source = "./modules/nvidia-installer"

# Inputs are forwarded unchanged from this stage's own variables.
cloud-provider = var.cloud-provider
gpu_enabled = var.gpu_enabled
gpu_node_group_names = var.gpu_node_group_names
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
resource "kubernetes_daemonset" "nvidia_installer" {
count = length(local.gpu_node_group_names) == 0 ? 0 : 1
resource "kubernetes_daemonset" "aws_nvidia_installer" {
count = var.gpu_enabled && (var.cloud-provider == "aws") ? 1 : 0
metadata {
name = "nvidia-device-plugin-daemonset-1.12"
namespace = "kube-system"
Expand Down Expand Up @@ -27,7 +27,7 @@ resource "kubernetes_daemonset" "nvidia_installer" {
match_expressions {
key = "eks.amazonaws.com/nodegroup"
operator = "In"
values = local.gpu_node_group_names
values = var.gpu_node_group_names
}
}
}
Expand Down
Loading