From 83724fd508b113db5023d7f3014da011ecc99ed7 Mon Sep 17 00:00:00 2001
From: johannessc
Date: Mon, 4 Dec 2023 13:01:56 +0100
Subject: [PATCH 1/6] feat: disabled gpu driver installation for gpu node pool
 and added gpu operator installation via helm

---
 modules/simphera_base/k8s.tf                  |  9 +++++-
 .../gpu_operator/gpu-operator-values.yaml     | 28 +++++++++++++++++++
 .../modules/gpu_operator/main.tf              | 11 ++++++++
 .../modules/gpu_operator/variables.tf         | 13 +++++++++
 modules/simphera_base/variables.tf            |  6 ++++
 5 files changed, 66 insertions(+), 1 deletion(-)
 create mode 100644 modules/simphera_base/modules/gpu_operator/gpu-operator-values.yaml
 create mode 100644 modules/simphera_base/modules/gpu_operator/main.tf
 create mode 100644 modules/simphera_base/modules/gpu_operator/variables.tf

diff --git a/modules/simphera_base/k8s.tf b/modules/simphera_base/k8s.tf
index c0d8de4..99d98ac 100644
--- a/modules/simphera_base/k8s.tf
+++ b/modules/simphera_base/k8s.tf
@@ -173,7 +173,7 @@ resource "azurerm_kubernetes_cluster_node_pool" "gpu-execution-nodes" {
     "purpose=gpu:NoSchedule"
   ]
 
-  tags = var.tags
+  tags = merge(var.tags, { SkipGPUDriverInstall = "true" })
 
   lifecycle {
     ignore_changes = [
@@ -186,6 +186,13 @@ resource "azurerm_kubernetes_cluster_node_pool" "gpu-execution-nodes" {
   }
 }
 
+module "gpu-operator" {
+  count = var.gpuNodePool ? 1 : 0
+
+  source           = "./modules/gpu_operator"
+  gpuDriverVersion = var.gpuDriverVersion
+}
+
 output "kube_config" {
   value     = azurerm_kubernetes_cluster.aks.kube_config
   sensitive = true
diff --git a/modules/simphera_base/modules/gpu_operator/gpu-operator-values.yaml b/modules/simphera_base/modules/gpu_operator/gpu-operator-values.yaml
new file mode 100644
index 0000000..8bceafc
--- /dev/null
+++ b/modules/simphera_base/modules/gpu_operator/gpu-operator-values.yaml
@@ -0,0 +1,28 @@
+operator:
+  defaultRuntime: containerd
+
+dcgmExporter:
+  enabled: false
+
+driver:
+  enabled: true
+  version: ${driver_version}
+
+toolkit:
+  enabled: true
+
+#TODO: make gpu nodePool taint in k8s.tf variable and then also fill the tolerations here with the corresponding values
+daemonsets:
+  tolerations:
+    - key: purpose
+      value: gpu
+      operator: Equal
+      effect: NoSchedule
+
+node-feature-discovery:
+  worker:
+    tolerations:
+      - key: purpose
+        value: gpu
+        operator: Equal
+        effect: NoSchedule
\ No newline at end of file
diff --git a/modules/simphera_base/modules/gpu_operator/main.tf b/modules/simphera_base/modules/gpu_operator/main.tf
new file mode 100644
index 0000000..7c7a1e1
--- /dev/null
+++ b/modules/simphera_base/modules/gpu_operator/main.tf
@@ -0,0 +1,11 @@
+terraform {
+  required_version = ">= 1.0.0"
+}
+
+resource "helm_release" "gpu-operator" {
+  name       = var.helmReleaseName
+  repository = "https://helm.ngc.nvidia.com/nvidia"
+  chart      = "gpu-operator"
+  version    = var.helmChartVersion
+  values     = [templatefile("gpu-operator-values.yaml", { driver_version = var.gpuDriverVersion })]
+}
diff --git a/modules/simphera_base/modules/gpu_operator/variables.tf b/modules/simphera_base/modules/gpu_operator/variables.tf
new file mode 100644
index 0000000..76e66f7
--- /dev/null
+++ b/modules/simphera_base/modules/gpu_operator/variables.tf
@@ -0,0 +1,13 @@
+variable "helmReleaseName" {
+  type    = string
+  default = "gpu-operator"
+}
+
+variable "helmChartVersion" {
+  type    = string
+  default = "v23.9.0"
+}
+
+variable "gpuDriverVersion" {
+  type = string
+}
diff --git a/modules/simphera_base/variables.tf b/modules/simphera_base/variables.tf
index 185cbb3..f74905a 100644
--- a/modules/simphera_base/variables.tf
+++ b/modules/simphera_base/variables.tf
@@ -86,6 +86,12 @@ variable "gpuNodeDeallocate" {
   default = true
 }
 
+variable "gpuDriverVersion" {
+  type        = string
+  description = "GPU Driver Version that the gpu-operator uses."
+  default     = "535.54.03"
+}
+
 variable "ssh_public_key_path" {
   type        = string
   description = "Path to the public SSH key to be used for the kubernetes nodes."

From 6d0d274292f2c95302770eae87848e9c44990502 Mon Sep 17 00:00:00 2001
From: johannessc
Date: Mon, 4 Dec 2023 13:31:43 +0100
Subject: [PATCH 2/6] fix: fix

---
 modules/simphera_base/modules/gpu_operator/main.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/simphera_base/modules/gpu_operator/main.tf b/modules/simphera_base/modules/gpu_operator/main.tf
index 7c7a1e1..38b0e71 100644
--- a/modules/simphera_base/modules/gpu_operator/main.tf
+++ b/modules/simphera_base/modules/gpu_operator/main.tf
@@ -7,5 +7,5 @@ resource "helm_release" "gpu-operator" {
   repository = "https://helm.ngc.nvidia.com/nvidia"
   chart      = "gpu-operator"
   version    = var.helmChartVersion
-  values     = [templatefile("gpu-operator-values.yaml", { driver_version = var.gpuDriverVersion })]
+  values     = [templatefile("./gpu-operator-values.yaml", { driver_version = var.gpuDriverVersion })]
 }

From dee2dc1dd986551ff6b6574d5ae018bbc380dbe9 Mon Sep 17 00:00:00 2001
From: johannessc
Date: Mon, 4 Dec 2023 13:37:37 +0100
Subject: [PATCH 3/6] fix: next fix

---
 modules/simphera_base/modules/gpu_operator/main.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/simphera_base/modules/gpu_operator/main.tf b/modules/simphera_base/modules/gpu_operator/main.tf
index 38b0e71..4b8e1ce 100644
--- a/modules/simphera_base/modules/gpu_operator/main.tf
+++ b/modules/simphera_base/modules/gpu_operator/main.tf
@@ -7,5 +7,5 @@ resource "helm_release" "gpu-operator" {
   repository = "https://helm.ngc.nvidia.com/nvidia"
   chart      = "gpu-operator"
   version    = var.helmChartVersion
-  values     = [templatefile("./gpu-operator-values.yaml", { driver_version = var.gpuDriverVersion })]
+  values     = [templatefile("${path.module}/gpu-operator-values.yaml", { driver_version = var.gpuDriverVersion })]
 }

From 50c40e6924a5489751093466b0b81c8839f00c1e Mon Sep 17 00:00:00 2001
From: johannessc
Date: Wed, 13 Dec 2023 17:20:13 +0100
Subject: [PATCH 4/6] fix: added depends_on for gpu operator module

---
 README.md                                           |  9 ++++++++-
 modules/simphera_base/k8s.tf                        |  1 +
 modules/simphera_base/modules/gpu_operator/main.tf  | 12 +++++++-----
 .../simphera_base/modules/gpu_operator/variables.tf |  9 +++++++--
 .../modules/simphera_instance/postgresql.tf         |  4 ++--
 5 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index d12a4f3..92603b6 100644
--- a/README.md
+++ b/README.md
@@ -135,9 +135,13 @@ List with description of all mandatory and optional variables could be find in t
 It is recommended to restrict the access to the Kubernetes API server using authorized IP address ranges by setting the variable `apiServerAuthorizedIpRanges`.
 It is recommended to restrict the access to the Key Vault using authorized IP address ranges by setting the variable `keyVaultAuthorizedIpRanges`.
 
+## GPU Usage
+
+If you use AURELION with SIMPHERA, the AURELION Pods are executed in the GPU node pool. AURELION uses a specific OptiX version and therefore needs specific NVIDIA drivers. NVIDIA provides the [gpu-operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/index.html), a tool that allows running containerized drivers inside Pods. This way, the required driver version can be used independently of the default NVIDIA driver installation on the GPU node pool, which can only be skipped entirely; selecting a specific version is not possible. [Further information](https://learn.microsoft.com/en-us/azure/aks/gpu-cluster#use-nvidia-gpu-operator-with-aks)
+
 ### Scale Down Mode
 
-If you use AURELION with SIMPHERA then the AURELION Pods are executed in the GPU node pool. Typically, you have autoscaling enabled for that pool so that VMs are scaled down if they are no longer needed. However, the AURELION container image is big and it takes time to download the image to the Kubernetes node. Depending on your location this can take more than 30 minutes. To shorten these times the _Scale Down Mode_ of the GPU node pool should be set to _Deallocate_. That means, that a GPU VM is not _deleted_ but only _deallocated_. So you no longer have to pay for the compute resources but only for the disk that will not be deleted when using this mode.
+Typically, you have autoscaling enabled for the GPU node pool so that VMs are scaled down if they are no longer needed. However, the AURELION container image is large and it takes time to download the image to the Kubernetes node. Depending on your location this can take more than 30 minutes. To shorten these times, the _Scale Down Mode_ of the GPU node pool should be set to _Deallocate_. That means that a GPU VM is not _deleted_ but only _deallocated_, so you no longer have to pay for the compute resources but only for the disk, which is not deleted in this mode.
 
 You can enable and disable this mode using the variables `linuxExecutionNodeDeallocate` and `gpuNodeDeallocate`. That means, you can not only configure this for the GPU node pool but also for the Execution node pool. As a default _Deallocate_ is used for both node pools.
 
@@ -244,6 +248,7 @@ As a next step you have to deploy SIMPHERA to the Kubernetes cluster by using th
 | Name | Source | Version |
 |------|--------|---------|
 | [simphera\_instance](#module\_simphera\_instance) | ./modules/simphera_instance | n/a |
+| [gpu\_operator](#module\_gpu\_operator) | ./modules/gpu_operator | n/a |
 
 ## Resources
 
@@ -289,6 +294,7 @@ As a next step you have to deploy SIMPHERA to the Kubernetes cluster by using th
 | [azurerm_client_config.current](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/client_config) | data source |
 | [azurerm_log_analytics_workspace.log-analytics-workspace](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/log_analytics_workspace) | data source |
 | [azurerm_public_ip.aks_outgoing](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/public_ip) | data source |
+| [helm_helm_release](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
 
 ## Inputs
 
@@ -300,6 +306,7 @@
 | [gpuNodeDeallocate](#input\_gpuNodeDeallocate) | Configures whether the nodes for the gpu job execution are 'Deallocated (Stopped)' by the cluster auto scaler or 'Deleted'. | `bool` | `true` | no |
 | [gpuNodePool](#input\_gpuNodePool) | Specifies whether an additional node pool for gpu job execution is added to the kubernetes cluster | `bool` | `false` | no |
 | [gpuNodeSize](#input\_gpuNodeSize) | The machine size of the nodes for the gpu job execution | `string` | `"Standard_NC16as_T4_v3"` | no |
+| [gpuDriverVersion](#input\_gpuDriverVersion) | Sets the NVIDIA Driver Version used. | `string` | `"535.54.03"` | no |
 | [infrastructurename](#input\_infrastructurename) | The name of the infrastructure. e.g. simphera-infra | `string` | n/a | yes |
 | [keyVaultAuthorizedIpRanges](#input\_keyVaultAuthorizedIpRanges) | List of authorized IP address ranges that are granted access to the Key Vault, e.g. ["198.51.100.0/24"] | `set(string)` | `[]` | no |
 | [keyVaultPurgeProtection](#input\_keyVaultPurgeProtection) | Specifies whether the Key vault purge protection is enabled. | `bool` | `true` | no |
diff --git a/modules/simphera_base/k8s.tf b/modules/simphera_base/k8s.tf
index 99d98ac..06d596b 100644
--- a/modules/simphera_base/k8s.tf
+++ b/modules/simphera_base/k8s.tf
@@ -191,6 +191,7 @@ module "gpu-operator" {
 
   source           = "./modules/gpu_operator"
   gpuDriverVersion = var.gpuDriverVersion
+  depends_on       = [azurerm_kubernetes_cluster.aks]
 }
 
 output "kube_config" {
diff --git a/modules/simphera_base/modules/gpu_operator/main.tf b/modules/simphera_base/modules/gpu_operator/main.tf
index 4b8e1ce..f7acddc 100644
--- a/modules/simphera_base/modules/gpu_operator/main.tf
+++ b/modules/simphera_base/modules/gpu_operator/main.tf
@@ -3,9 +3,11 @@ terraform {
 }
 
 resource "helm_release" "gpu-operator" {
-  name       = var.helmReleaseName
-  repository = "https://helm.ngc.nvidia.com/nvidia"
-  chart      = "gpu-operator"
-  version    = var.helmChartVersion
-  values     = [templatefile("${path.module}/gpu-operator-values.yaml", { driver_version = var.gpuDriverVersion })]
+  name             = var.helmReleaseName
+  repository       = "https://helm.ngc.nvidia.com/nvidia"
+  chart            = "gpu-operator"
+  version          = var.helmChartVersion
+  namespace        = var.namespace
+  create_namespace = true
+  values           = [templatefile("${path.module}/gpu-operator-values.yaml", { driver_version = var.gpuDriverVersion })]
 }
diff --git a/modules/simphera_base/modules/gpu_operator/variables.tf b/modules/simphera_base/modules/gpu_operator/variables.tf
index 76e66f7..193660f 100644
--- a/modules/simphera_base/modules/gpu_operator/variables.tf
+++ b/modules/simphera_base/modules/gpu_operator/variables.tf
@@ -1,3 +1,7 @@
+variable "gpuDriverVersion" {
+  type = string
+}
+
 variable "helmReleaseName" {
   type    = string
   default = "gpu-operator"
 }
@@ -8,6 +12,7 @@ variable "helmChartVersion" {
   default = "v23.9.0"
 }
 
-variable "gpuDriverVersion" {
-  type = string
+variable "namespace" {
+  type    = string
+  default = "gpu-operator"
 }
diff --git a/modules/simphera_base/modules/simphera_instance/postgresql.tf b/modules/simphera_base/modules/simphera_instance/postgresql.tf
index 3abb370..078e973 100644
--- a/modules/simphera_base/modules/simphera_instance/postgresql.tf
+++ b/modules/simphera_base/modules/simphera_instance/postgresql.tf
@@ -74,14 +74,14 @@ resource "azurerm_postgresql_flexible_server_database" "keycloak" {
   name      = "keycloak"
   server_id = azurerm_postgresql_flexible_server.postgresql-flexible.id
   charset   = "UTF8"
-  collation = "en_US.UTF8"
+  collation = "en_US.utf8"
 }
 
 resource "azurerm_postgresql_flexible_server_database" "simphera" {
   name      = "simphera"
   server_id = azurerm_postgresql_flexible_server.postgresql-flexible.id
   charset   = "UTF8"
-  collation = "en_US.UTF8"
+  collation = "en_US.utf8"
 }
 
 resource "azurerm_postgresql_flexible_server_configuration" "pgcrypto" {

From 2e3531c948a6b6a8bba78a4a66f6067aeb0a37b5 Mon Sep 17 00:00:00 2001
From: johannessc
Date: Mon, 18 Dec 2023 11:09:15 +0100
Subject: [PATCH 5/6] chore: removed helm stuff from terraform

---
 README.md                                     |  2 --
 modules/simphera_base/k8s.tf                  |  8 ------
 .../gpu_operator/gpu-operator-values.yaml     | 28 -------------------
 .../modules/gpu_operator/main.tf              | 13 ---------
 .../modules/gpu_operator/variables.tf         | 18 ------------
 5 files changed, 69 deletions(-)
 delete mode 100644 modules/simphera_base/modules/gpu_operator/gpu-operator-values.yaml
 delete mode 100644 modules/simphera_base/modules/gpu_operator/main.tf
 delete mode 100644 modules/simphera_base/modules/gpu_operator/variables.tf

diff --git a/README.md b/README.md
index 92603b6..87245ab 100644
--- a/README.md
+++ b/README.md
@@ -294,7 +294,6 @@ As a next step you have to deploy SIMPHERA to the Kubernetes cluster by using th
 | [azurerm_client_config.current](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/client_config) | data source |
 | [azurerm_log_analytics_workspace.log-analytics-workspace](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/log_analytics_workspace) | data source |
 | [azurerm_public_ip.aks_outgoing](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/public_ip) | data source |
-| [helm_helm_release](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
 
 ## Inputs
 
@@ -306,7 +305,6 @@
 | [gpuNodeDeallocate](#input\_gpuNodeDeallocate) | Configures whether the nodes for the gpu job execution are 'Deallocated (Stopped)' by the cluster auto scaler or 'Deleted'. | `bool` | `true` | no |
 | [gpuNodePool](#input\_gpuNodePool) | Specifies whether an additional node pool for gpu job execution is added to the kubernetes cluster | `bool` | `false` | no |
 | [gpuNodeSize](#input\_gpuNodeSize) | The machine size of the nodes for the gpu job execution | `string` | `"Standard_NC16as_T4_v3"` | no |
-| [gpuDriverVersion](#input\_gpuDriverVersion) | Sets the NVIDIA Driver Version used. | `string` | `"535.54.03"` | no |
 | [infrastructurename](#input\_infrastructurename) | The name of the infrastructure. e.g. simphera-infra | `string` | n/a | yes |
 | [keyVaultAuthorizedIpRanges](#input\_keyVaultAuthorizedIpRanges) | List of authorized IP address ranges that are granted access to the Key Vault, e.g. ["198.51.100.0/24"] | `set(string)` | `[]` | no |
 | [keyVaultPurgeProtection](#input\_keyVaultPurgeProtection) | Specifies whether the Key vault purge protection is enabled. | `bool` | `true` | no |
diff --git a/modules/simphera_base/k8s.tf b/modules/simphera_base/k8s.tf
index 06d596b..b8a2038 100644
--- a/modules/simphera_base/k8s.tf
+++ b/modules/simphera_base/k8s.tf
@@ -186,14 +186,6 @@ resource "azurerm_kubernetes_cluster_node_pool" "gpu-execution-nodes" {
   }
 }
 
-module "gpu-operator" {
-  count = var.gpuNodePool ? 1 : 0
-
-  source           = "./modules/gpu_operator"
-  gpuDriverVersion = var.gpuDriverVersion
-  depends_on       = [azurerm_kubernetes_cluster.aks]
-}
-
 output "kube_config" {
   value     = azurerm_kubernetes_cluster.aks.kube_config
   sensitive = true
diff --git a/modules/simphera_base/modules/gpu_operator/gpu-operator-values.yaml b/modules/simphera_base/modules/gpu_operator/gpu-operator-values.yaml
deleted file mode 100644
index 8bceafc..0000000
--- a/modules/simphera_base/modules/gpu_operator/gpu-operator-values.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-operator:
-  defaultRuntime: containerd
-
-dcgmExporter:
-  enabled: false
-
-driver:
-  enabled: true
-  version: ${driver_version}
-
-toolkit:
-  enabled: true
-
-#TODO: make gpu nodePool taint in k8s.tf variable and then also fill the tolerations here with the corresponding values
-daemonsets:
-  tolerations:
-    - key: purpose
-      value: gpu
-      operator: Equal
-      effect: NoSchedule
-
-node-feature-discovery:
-  worker:
-    tolerations:
-      - key: purpose
-        value: gpu
-        operator: Equal
-        effect: NoSchedule
\ No newline at end of file
diff --git a/modules/simphera_base/modules/gpu_operator/main.tf b/modules/simphera_base/modules/gpu_operator/main.tf
deleted file mode 100644
index f7acddc..0000000
--- a/modules/simphera_base/modules/gpu_operator/main.tf
+++ /dev/null
@@ -1,13 +0,0 @@
-terraform {
-  required_version = ">= 1.0.0"
-}
-
-resource "helm_release" "gpu-operator" {
-  name             = var.helmReleaseName
-  repository       = "https://helm.ngc.nvidia.com/nvidia"
-  chart            = "gpu-operator"
-  version          = var.helmChartVersion
-  namespace        = var.namespace
-  create_namespace = true
-  values           = [templatefile("${path.module}/gpu-operator-values.yaml", { driver_version = var.gpuDriverVersion })]
-}
diff --git a/modules/simphera_base/modules/gpu_operator/variables.tf b/modules/simphera_base/modules/gpu_operator/variables.tf
deleted file mode 100644
index 193660f..0000000
--- a/modules/simphera_base/modules/gpu_operator/variables.tf
+++ /dev/null
@@ -1,18 +0,0 @@
-variable "gpuDriverVersion" {
-  type = string
-}
-
-variable "helmReleaseName" {
-  type    = string
-  default = "gpu-operator"
-}
-
-variable "helmChartVersion" {
-  type    = string
-  default = "v23.9.0"
-}
-
-variable "namespace" {
-  type    = string
-  default = "gpu-operator"
-}

From cc856b85558d96eb6500327cf2af1f4d7ce591c1 Mon Sep 17 00:00:00 2001
From: johannessc
Date: Mon, 18 Dec 2023 15:27:33 +0100
Subject: [PATCH 6/6] chore: removed old stuff

---
 README.md                          | 1 -
 modules/simphera_base/variables.tf | 6 ------
 2 files changed, 7 deletions(-)

diff --git a/README.md b/README.md
index 87245ab..f8ee12a 100644
--- a/README.md
+++ b/README.md
@@ -248,7 +248,6 @@ As a next step you have to deploy SIMPHERA to the Kubernetes cluster by using th
 | Name | Source | Version |
 |------|--------|---------|
 | [simphera\_instance](#module\_simphera\_instance) | ./modules/simphera_instance | n/a |
-| [gpu\_operator](#module\_gpu\_operator) | ./modules/gpu_operator | n/a |
 
 ## Resources
 
diff --git a/modules/simphera_base/variables.tf b/modules/simphera_base/variables.tf
index f74905a..185cbb3 100644
--- a/modules/simphera_base/variables.tf
+++ b/modules/simphera_base/variables.tf
@@ -86,12 +86,6 @@ variable "gpuNodeDeallocate" {
   default = true
 }
 
-variable "gpuDriverVersion" {
-  type        = string
-  description = "GPU Driver Version that the gpu-operator uses."
-  default     = "535.54.03"
-}
-
 variable "ssh_public_key_path" {
   type        = string
   description = "Path to the public SSH key to be used for the kubernetes nodes."
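
The last two patches remove the Helm release from Terraform again, so after this series the gpu-operator is no longer managed by this module and would have to be installed out of band if containerized drivers are still needed. For reference, a minimal sketch of an equivalent manual installation with the Helm CLI, reusing the chart source, release name, namespace, chart version, driver version, and taint tolerations from the removed module; this is an illustration, not part of the patches, and the `--set-json` flag requires Helm 3.10 or newer:

  # Add the NVIDIA chart repository that the removed helm_release pointed at.
  helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
  helm repo update

  # Install chart v23.9.0 with the settings from the deleted gpu-operator-values.yaml:
  # containerd as default runtime, DCGM exporter disabled, driver pinned to 535.54.03,
  # and tolerations matching the purpose=gpu:NoSchedule taint of the GPU node pool.
  helm install gpu-operator nvidia/gpu-operator \
    --namespace gpu-operator --create-namespace \
    --version v23.9.0 \
    --set operator.defaultRuntime=containerd \
    --set dcgmExporter.enabled=false \
    --set driver.version=535.54.03 \
    --set-json 'daemonsets.tolerations=[{"key":"purpose","operator":"Equal","value":"gpu","effect":"NoSchedule"}]' \
    --set-json 'node-feature-discovery.worker.tolerations=[{"key":"purpose","operator":"Equal","value":"gpu","effect":"NoSchedule"}]'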