From a22c29f4f0fa089ca405e2fe72868e8ac456b030 Mon Sep 17 00:00:00 2001 From: Bo Guan Date: Thu, 16 May 2024 20:30:08 -0500 Subject: [PATCH 1/4] qdrant --- qdrant.tf | 107 +++++++++++++ test/helm-values/argo-values.yaml | 5 + test/helm-values/spark-operator-values.yaml | 2 +- test/main.tf | 158 ++++++++++++++++++-- variables.tf | 15 ++ 5 files changed, 272 insertions(+), 15 deletions(-) create mode 100644 qdrant.tf create mode 100644 test/helm-values/argo-values.yaml diff --git a/qdrant.tf b/qdrant.tf new file mode 100644 index 0000000..af9fa3a --- /dev/null +++ b/qdrant.tf @@ -0,0 +1,107 @@ +locals { + qdrant_name = "qdrant" + qdrant_repository = "https://qdrant.github.io/qdrant-helm" + qdrant_version = "0.7.6" + + qdrant_namespace = try(var.qdrant_helm_config["namespace"], local.qdrant_name) + qdrant_set_values = [] + + qdrant_default_values = <<-EOT +replicaCount: 3 + +env: + - name: QDRANT__TELEMETRY_DISABLED + value: true + +service: + type: LoadBalancer + +resources: + limits: + cpu: 200m + memory: 1Gi + requests: + cpu: 200m + memory: 1Gi + +persistence: + storageClassName: gp2 + +metrics: + serviceMonitor: + enabled: true + +apiKey: false + +EOT + + qdrant_merged_values_yaml = yamlencode(merge( + yamldecode(local.qdrant_default_values), + try(yamldecode(var.qdrant_helm_config.values[0]), {}) + )) + +} + +resource "helm_release" "qdrant" { + count = var.enable_qdrant ? 1 : 0 + + name = try(var.qdrant_helm_config["name"], local.qdrant_name) + repository = try(var.qdrant_helm_config["repository"], local.qdrant_repository) + chart = try(var.qdrant_helm_config["chart"], local.qdrant_name) + version = try(var.qdrant_helm_config["version"], local.qdrant_version) + timeout = try(var.qdrant_helm_config["timeout"], 300) + values = [local.qdrant_merged_values_yaml] + create_namespace = try(var.qdrant_helm_config["create_namespace"], true) + namespace = local.qdrant_namespace + lint = try(var.qdrant_helm_config["lint"], false) + description = try(var.qdrant_helm_config["description"], "") + repository_key_file = try(var.qdrant_helm_config["repository_key_file"], "") + repository_cert_file = try(var.qdrant_helm_config["repository_cert_file"], "") + repository_username = try(var.qdrant_helm_config["repository_username"], "") + repository_password = try(var.qdrant_helm_config["repository_password"], "") + verify = try(var.qdrant_helm_config["verify"], false) + keyring = try(var.qdrant_helm_config["keyring"], "") + disable_webhooks = try(var.qdrant_helm_config["disable_webhooks"], false) + reuse_values = try(var.qdrant_helm_config["reuse_values"], false) + reset_values = try(var.qdrant_helm_config["reset_values"], false) + force_update = try(var.qdrant_helm_config["force_update"], false) + recreate_pods = try(var.qdrant_helm_config["recreate_pods"], false) + cleanup_on_fail = try(var.qdrant_helm_config["cleanup_on_fail"], false) + max_history = try(var.qdrant_helm_config["max_history"], 0) + atomic = try(var.qdrant_helm_config["atomic"], false) + skip_crds = try(var.qdrant_helm_config["skip_crds"], false) + render_subchart_notes = try(var.qdrant_helm_config["render_subchart_notes"], true) + disable_openapi_validation = try(var.qdrant_helm_config["disable_openapi_validation"], false) + wait = try(var.qdrant_helm_config["wait"], true) + wait_for_jobs = try(var.qdrant_helm_config["wait_for_jobs"], false) + dependency_update = try(var.qdrant_helm_config["dependency_update"], false) + replace = try(var.qdrant_helm_config["replace"], false) + + postrender { + binary_path = try(var.qdrant_helm_config["postrender"], "") + } + + dynamic "set" { + iterator = each_item + for_each = distinct(concat(try(var.qdrant_helm_config.set, []), local.qdrant_set_values)) + + content { + name = each_item.value.name + value = each_item.value.value + type = try(each_item.value.type, null) + } + } + + dynamic "set_sensitive" { + iterator = each_item + for_each = try(var.qdrant_helm_config["set_sensitive"], []) + + content { + name = each_item.value.name + value = each_item.value.value + type = try(each_item.value.type, null) + } + } +} + + diff --git a/test/helm-values/argo-values.yaml b/test/helm-values/argo-values.yaml new file mode 100644 index 0000000..d846cf1 --- /dev/null +++ b/test/helm-values/argo-values.yaml @@ -0,0 +1,5 @@ +server: + autoscaling: + enabled: true + minReplicas: 1 + serviceType: LoadBalancer \ No newline at end of file diff --git a/test/helm-values/spark-operator-values.yaml b/test/helm-values/spark-operator-values.yaml index be50a1e..e098fc1 100644 --- a/test/helm-values/spark-operator-values.yaml +++ b/test/helm-values/spark-operator-values.yaml @@ -1,7 +1,7 @@ replicaCount: 1 # -- Set this if running spark jobs in a different namespace than the operator -sparkJobNamespace: "spark-team-a" +#sparkJobNamespace: "spark-team-a" # -- Operator concurrency, higher values might increase memory usage controllerThreads: 10 diff --git a/test/main.tf b/test/main.tf index 61db548..dab60af 100644 --- a/test/main.tf +++ b/test/main.tf @@ -2,6 +2,13 @@ provider "aws" { region = local.region } +# ECR always authenticates with `us-east-1` region +# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html +provider "aws" { + alias = "ecr" + region = "us-east-1" +} + provider "kubernetes" { host = module.eks.cluster_endpoint cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) @@ -14,17 +21,22 @@ provider "kubernetes" { } } +data "aws_eks_cluster_auth" "this" { + name = module.eks.cluster_name +} + provider "helm" { kubernetes { host = module.eks.cluster_endpoint cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - exec { - api_version = "client.authentication.k8s.io/v1beta1" - command = "aws" - # This requires the awscli to be installed locally where Terraform is executed - args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] - } + token = data.aws_eks_cluster_auth.this.token + #exec { + # api_version = "client.authentication.k8s.io/v1beta1" + # command = "aws" + # # This requires the awscli to be installed locally where Terraform is executed + # args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + #} } } @@ -35,6 +47,10 @@ data "aws_ecr_authorization_token" "token" { registry_id = "895885662937" } +data "aws_ecrpublic_authorization_token" "token" { + provider = aws.ecr +} + data "aws_availability_zones" "available" {} locals { @@ -59,8 +75,15 @@ module "doeks_data_addons" { enable_aws_neuron_device_plugin = true enable_emr_spark_operator = true enable_flink_operator = true + flink_operator_helm_config = { + version = "1.8.0" + } enable_jupyterhub = true enable_kubecost = true + kubecost_helm_config = { + repository_username = data.aws_ecrpublic_authorization_token.token.user_name + repository_password = data.aws_ecrpublic_authorization_token.token.password + } enable_nvidia_gpu_operator = true enable_kuberay_operator = true enable_spark_history_server = true @@ -76,30 +99,137 @@ module "doeks_data_addons" { } enable_strimzi_kafka_operator = true enable_yunikorn = true + + enable_qdrant = true + +} + + +module "eks_blueprints_addons" { + source = "aws-ia/eks-blueprints-addons/aws" + version = "~> 1.0" #ensure to update this to the latest/desired version + cluster_name = module.eks.cluster_name + cluster_endpoint = module.eks.cluster_endpoint + cluster_version = module.eks.cluster_version + oidc_provider_arn = module.eks.oidc_provider_arn + + enable_aws_load_balancer_controller = true + enable_kube_prometheus_stack = true + enable_metrics_server = true + enable_cert_manager = true + + enable_karpenter = true + karpenter_enable_spot_termination = true + karpenter_node = { + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + karpenter = { + chart_version = "v0.34.0" + repository_username = data.aws_ecrpublic_authorization_token.token.user_name + repository_password = data.aws_ecrpublic_authorization_token.token.password + } + + enable_argo_workflows = true + argo_workflows = { + name = "argo-workflows" + chart_version = "0.41.4" + repository = "https://argoproj.github.io/argo-helm" + namespace = "argo-workflows" + values = [templatefile("${path.module}/helm-values/argo-values.yaml", {})] + } +} + +module "ebs_csi_driver_irsa" { + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.20" + role_name_prefix = format("%s-%s-", local.name, "ebs-csi-driver") + attach_ebs_csi_policy = true + oidc_providers = { + main = { + provider_arn = module.eks.oidc_provider_arn + namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"] + } + } + tags = local.tags } # checkov:skip=CKV_TF_1 #tfsec:ignore:aws-eks-enable-control-plane-logging module "eks" { source = "terraform-aws-modules/eks/aws" - version = "~> 19.13" + version = "~> 20.8" + depends_on = [module.vpc] cluster_name = local.name - cluster_version = "1.26" + cluster_version = "1.29" cluster_endpoint_public_access = true + cluster_endpoint_private_access = true vpc_id = module.vpc.vpc_id subnet_ids = module.vpc.private_subnets - manage_aws_auth_configmap = true + cluster_addons = { + coredns = { + most_recent = true + } + kube-proxy = { + most_recent = true + } + vpc-cni = { + most_recent = true + } + aws-ebs-csi-driver = { + most_recent = true + service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn + } + } + + eks_managed_node_group_defaults = { + iam_role_additional_policies = { + # Not required, but used in the example to access the nodes to inspect mounted volumes + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + + enable_cluster_creator_admin_permissions = true eks_managed_node_groups = { - initial = { - instance_types = ["m5.xlarge"] + # We recommend to have a MNG to place your critical workloads and add-ons + # Then rely on Karpenter to scale your workloads + # You can also make uses on nodeSelector and Taints/tolerations to spread workloads on MNG or Karpenter provisioners + core_node_group = { + name = "core-node-group" + description = "EKS managed node group example launch template" + subnet_ids = module.vpc.private_subnets + + min_size = 1 + max_size = 9 + desired_size = 7 + + force_update_version = true + instance_types = ["m6g.xlarge"] + ami_type = "AL2_ARM_64" + ebs_optimized = true + block_device_mappings = { + xvda = { + device_name = "/dev/xvda" + ebs = { + volume_size = 100 + volume_type = "gp3" + } + } + } + + labels = { + WorkerType = "ON_DEMAND" + NodeGroupType = "core" + } - min_size = 4 - max_size = 10 - desired_size = 4 + tags = { + Name = "core-node-grp" + } } } diff --git a/variables.tf b/variables.tf index 3e8d0d4..446e7ea 100644 --- a/variables.tf +++ b/variables.tf @@ -364,3 +364,18 @@ variable "superset_helm_config" { type = any default = {} } + +#--------------------------------------------------- +# Qdrant +#--------------------------------------------------- +variable "enable_qdrant" { + description = "Enable Qdrant Vector Database add-on" + type = bool + default = false +} + +variable "qdrant_helm_config" { + description = "Helm configuration for Qdrant" + type = any + default = {} +} From eccb10a6f36478d190afc4d3a6779a72aed27b8c Mon Sep 17 00:00:00 2001 From: Bo Guan Date: Thu, 23 May 2024 14:38:38 -0500 Subject: [PATCH 2/4] example --- test/helm-values/kube-prometheus-values.yaml | 4 -- test/helm-values/spark-operator-values.yaml | 2 +- test/main.tf | 41 +++++++++++++++----- 3 files changed, 32 insertions(+), 15 deletions(-) delete mode 100644 test/helm-values/kube-prometheus-values.yaml diff --git a/test/helm-values/kube-prometheus-values.yaml b/test/helm-values/kube-prometheus-values.yaml deleted file mode 100644 index 9df713d..0000000 --- a/test/helm-values/kube-prometheus-values.yaml +++ /dev/null @@ -1,4 +0,0 @@ -prometheus: - prometheusSpec: - serviceMonitorSelectorNilUsesHelmValues: false - \ No newline at end of file diff --git a/test/helm-values/spark-operator-values.yaml b/test/helm-values/spark-operator-values.yaml index e098fc1..be50a1e 100644 --- a/test/helm-values/spark-operator-values.yaml +++ b/test/helm-values/spark-operator-values.yaml @@ -1,7 +1,7 @@ replicaCount: 1 # -- Set this if running spark jobs in a different namespace than the operator -#sparkJobNamespace: "spark-team-a" +sparkJobNamespace: "spark-team-a" # -- Operator concurrency, higher values might increase memory usage controllerThreads: 10 diff --git a/test/main.tf b/test/main.tf index e8af2d2..550b21e 100644 --- a/test/main.tf +++ b/test/main.tf @@ -70,11 +70,10 @@ module "doeks_data_addons" { source = "../" oidc_provider_arn = module.eks.oidc_provider_arn - enable_airflow = false + enable_airflow = true enable_aws_efa_k8s_device_plugin = true enable_aws_neuron_device_plugin = true - - enable_emr_spark_operator = false + enable_emr_spark_operator = true emr_spark_operator_helm_config = { repository_username = data.aws_ecr_authorization_token.token.user_name repository_password = data.aws_ecr_authorization_token.token.password @@ -85,28 +84,43 @@ module "doeks_data_addons" { version = "1.8.0" } enable_jupyterhub = true - enable_kubecost = false // kubecost not working with prometheus stack as node-exporter already exists + enable_kubecost = true kubecost_helm_config = { repository_username = data.aws_ecrpublic_authorization_token.token.user_name repository_password = data.aws_ecrpublic_authorization_token.token.password + values = [ + <<-EOT + global: + prometheus: + fqdn: http://kube-prometheus-stack-prometheus.kube-prometheus-stack.svc:9090 + enabled: false + EOT + ] } + enable_nvidia_gpu_operator = true - enable_kuberay_operator = false - enable_spark_history_server = false + enable_kuberay_operator = true + kuberay_operator_helm_config = { + version = "1.1.0" + } + enable_spark_history_server = true enable_spark_operator = true # With custom values spark_operator_helm_config = { values = [templatefile("${path.module}/helm-values/spark-operator-values.yaml", {})] } - enable_strimzi_kafka_operator = false - enable_yunikorn = false + + enable_strimzi_kafka_operator = true + enable_yunikorn = true + yunikorn_helm_config = { + version = "1.5.0" + } enable_qdrant = true } - module "eks_blueprints_addons" { source = "aws-ia/eks-blueprints-addons/aws" version = "~> 1.0" #ensure to update this to the latest/desired version @@ -118,8 +132,15 @@ module "eks_blueprints_addons" { enable_aws_load_balancer_controller = true enable_kube_prometheus_stack = true kube_prometheus_stack = { - values = [templatefile("${path.module}/helm-values/kube-prometheus-values.yaml", {})] + values = [ + <<-EOT + prometheus: + prometheusSpec: + serviceMonitorSelectorNilUsesHelmValues: false + EOT + ] } + enable_metrics_server = true enable_cert_manager = true From 6051115d1a7d6fde43ad7a9970078720745e1898 Mon Sep 17 00:00:00 2001 From: Bo Guan Date: Thu, 23 May 2024 14:52:11 -0500 Subject: [PATCH 3/4] precommit --- qdrant.tf | 6 ++---- test/main.tf | 60 ++++++++++++++++++++++++++-------------------------- 2 files changed, 32 insertions(+), 34 deletions(-) diff --git a/qdrant.tf b/qdrant.tf index af9fa3a..873fb99 100644 --- a/qdrant.tf +++ b/qdrant.tf @@ -3,7 +3,7 @@ locals { qdrant_repository = "https://qdrant.github.io/qdrant-helm" qdrant_version = "0.7.6" - qdrant_namespace = try(var.qdrant_helm_config["namespace"], local.qdrant_name) + qdrant_namespace = try(var.qdrant_helm_config["namespace"], local.qdrant_name) qdrant_set_values = [] qdrant_default_values = <<-EOT @@ -30,7 +30,7 @@ persistence: metrics: serviceMonitor: enabled: true - + apiKey: false EOT @@ -103,5 +103,3 @@ resource "helm_release" "qdrant" { } } } - - diff --git a/test/main.tf b/test/main.tf index 550b21e..3951047 100644 --- a/test/main.tf +++ b/test/main.tf @@ -30,7 +30,7 @@ provider "helm" { host = module.eks.cluster_endpoint cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token + token = data.aws_eks_cluster_auth.this.token #exec { # api_version = "client.authentication.k8s.io/v1beta1" # command = "aws" @@ -79,12 +79,12 @@ module "doeks_data_addons" { repository_password = data.aws_ecr_authorization_token.token.password } - enable_flink_operator = true - flink_operator_helm_config = { - version = "1.8.0" + enable_flink_operator = true + flink_operator_helm_config = { + version = "1.8.0" } - enable_jupyterhub = true - enable_kubecost = true + enable_jupyterhub = true + enable_kubecost = true kubecost_helm_config = { repository_username = data.aws_ecrpublic_authorization_token.token.user_name repository_password = data.aws_ecrpublic_authorization_token.token.password @@ -98,17 +98,17 @@ module "doeks_data_addons" { ] } - enable_nvidia_gpu_operator = true - enable_kuberay_operator = true + enable_nvidia_gpu_operator = true + enable_kuberay_operator = true kuberay_operator_helm_config = { version = "1.1.0" } - enable_spark_history_server = true - enable_spark_operator = true + enable_spark_history_server = true + enable_spark_operator = true # With custom values spark_operator_helm_config = { - values = [templatefile("${path.module}/helm-values/spark-operator-values.yaml", {})] + values = [templatefile("${path.module}/helm-values/spark-operator-values.yaml", {})] } enable_strimzi_kafka_operator = true @@ -117,20 +117,20 @@ module "doeks_data_addons" { version = "1.5.0" } - enable_qdrant = true - + enable_qdrant = true + } module "eks_blueprints_addons" { - source = "aws-ia/eks-blueprints-addons/aws" - version = "~> 1.0" #ensure to update this to the latest/desired version + source = "aws-ia/eks-blueprints-addons/aws" + version = "~> 1.0" #ensure to update this to the latest/desired version cluster_name = module.eks.cluster_name cluster_endpoint = module.eks.cluster_endpoint cluster_version = module.eks.cluster_version oidc_provider_arn = module.eks.oidc_provider_arn - enable_aws_load_balancer_controller = true - enable_kube_prometheus_stack = true + enable_aws_load_balancer_controller = true + enable_kube_prometheus_stack = true kube_prometheus_stack = { values = [ <<-EOT @@ -141,9 +141,9 @@ module "eks_blueprints_addons" { ] } - enable_metrics_server = true - enable_cert_manager = true - + enable_metrics_server = true + enable_cert_manager = true + } module "ebs_csi_driver_irsa" { @@ -163,13 +163,13 @@ module "ebs_csi_driver_irsa" { # checkov:skip=CKV_TF_1 #tfsec:ignore:aws-eks-enable-control-plane-logging module "eks" { - source = "terraform-aws-modules/eks/aws" - version = "~> 20.8" + source = "terraform-aws-modules/eks/aws" + version = "~> 20.8" depends_on = [module.vpc] - cluster_name = local.name - cluster_version = "1.29" - cluster_endpoint_public_access = true + cluster_name = local.name + cluster_version = "1.29" + cluster_endpoint_public_access = true cluster_endpoint_private_access = true vpc_id = module.vpc.vpc_id @@ -186,7 +186,7 @@ module "eks" { most_recent = true } aws-ebs-csi-driver = { - most_recent = true + most_recent = true service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn } } @@ -213,10 +213,10 @@ module "eks" { max_size = 9 desired_size = 7 - force_update_version = true - instance_types = ["m6g.xlarge"] - ami_type = "AL2_ARM_64" - ebs_optimized = true + force_update_version = true + instance_types = ["m6g.xlarge"] + ami_type = "AL2_ARM_64" + ebs_optimized = true block_device_mappings = { xvda = { device_name = "/dev/xvda" From d81bfbdbaf10db104d736050959bc6dda40952a9 Mon Sep 17 00:00:00 2001 From: Bo Guan Date: Mon, 24 Jun 2024 10:15:34 -0500 Subject: [PATCH 4/4] cli ver --- test/versions.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/versions.tf b/test/versions.tf index e2fe32c..a00f4ab 100644 --- a/test/versions.tf +++ b/test/versions.tf @@ -1,5 +1,5 @@ terraform { - required_version = ">= 1.0" + required_version = ">= 1.2" required_providers { aws = {