From ea9fe27c0b20b9494115c9cfdfc1bb60e32c2691 Mon Sep 17 00:00:00 2001 From: Angelo Fenoglio Date: Tue, 21 Jan 2025 09:57:50 -0300 Subject: [PATCH] Feature | Merge EKS clusters into one (#659) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Angelo, the author of this PR, shared the following message with me: "This is now fully tested, it just needs approval to be merged. I won’t be available tomorrow, but feel free to merge it on my behalf if everything looks good." Since it’s ready to go and fully tested, I’ll proceed to merge the PR. * Drop config for argocd exclusive nodes * Drop unused namespaces * Upgrade Argo and add nodeSelector and tolerations * Typos * Configure Slack notifications for ArgoCD * Change switchboard layout * Fix cert manager reference * Add Alertmanager and Grafana implementation to Kube Prom Stack * Make Fluent-Bit use IRSA and OpenSearch * Add IAM roles for grafana and fluent-bit * Add Goldilocks * Add Gatus * Fix ArgoCD definition * Conform keda to switchboard * Bump terraform version CC: @binbashar/leverage-ref-architecture-aws-admin @binbashar/leverage-ref-architecture-aws-dev --- .../us-east-1/k8s-eks-demoapps/README.md | 2 +- .../cluster/eks-workers-managed.tf | 16 - .../identities/ids_fluentbit.tf | 44 +++ .../identities/ids_grafana.tf | 70 ++++ .../k8s-eks-demoapps/identities/locals.tf | 2 + .../k8s-eks-demoapps/identities/outputs.tf | 10 + .../k8s-components/chart-values/argo-cd.yaml | 309 ++++++++++++++++- .../chart-values/argo-rollouts.yaml | 17 +- .../chart-values/argocd-image-updater.yaml | 4 +- .../chart-values/fluentbit.yaml | 28 +- .../k8s-components/chart-values/gatus.yaml | 52 +++ .../chart-values/goldilocks.yaml | 23 ++ .../chart-values/kube-prometheus-stack.yaml | 123 +++++-- .../k8s-components/cicd-argo.tf | 59 ++-- .../k8s-components/common-variables.tf | 134 +------- .../k8s-eks-demoapps/k8s-components/config.tf | 2 +- .../identity-external-prometheus.tf | 6 +- 
.../identity-grafana-kubegraf.tf | 6 +- .../k8s-eks-demoapps/k8s-components/locals.tf | 7 +- .../k8s-components/monitoring-alerts.tf | 2 +- .../k8s-components/monitoring-cost.tf | 2 +- .../k8s-components/monitoring-logging.tf | 12 +- .../k8s-components/monitoring-metrics.tf | 48 ++- .../k8s-components/monitoring-other.tf | 26 +- .../k8s-components/namespaces.tf | 57 +--- .../k8s-components/networking-dns.tf | 4 +- .../k8s-components/networking-ingress.tf | 10 +- .../k8s-components/scaling.tf | 34 +- .../k8s-components/security.tf | 12 +- .../k8s-components/terraform.tfvars | 158 +++++++-- .../k8s-components/variables.tf | 319 +++++++++--------- .../k8s-eks-demoapps/network/config.tf | 2 +- .../chart-values/kube-prometheus-stack.yaml | 6 +- build.env | 2 +- shared/us-east-1/secrets-manager/secrets.tf | 25 +- 35 files changed, 1121 insertions(+), 512 deletions(-) create mode 100644 apps-devstg/us-east-1/k8s-eks-demoapps/identities/ids_fluentbit.tf create mode 100644 apps-devstg/us-east-1/k8s-eks-demoapps/identities/ids_grafana.tf create mode 100644 apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/gatus.yaml create mode 100644 apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/goldilocks.yaml mode change 100644 => 120000 apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/common-variables.tf diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/README.md b/apps-devstg/us-east-1/k8s-eks-demoapps/README.md index 71398f053..62b07b720 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/README.md +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/README.md @@ -100,7 +100,7 @@ The EKS CLUSTER layers need to be orchestrated in the following order: 6. In the output you should see the credentials you need to talk to Kubernetes API via kubectl (or other clients). ```shell - apps-devstg//k8s-eks-v1.17/cluster$ leverage terraform output + apps-devstg//k8s-eks-demoapps/cluster$ leverage terraform output ... 
kubectl_config = apiVersion: v1 diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/cluster/eks-workers-managed.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/cluster/eks-workers-managed.tf index fd5c6c7a4..6900fd456 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/cluster/eks-workers-managed.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/cluster/eks-workers-managed.tf @@ -148,22 +148,6 @@ module "cluster" { # } # } # } - # argocd = { - # desired_size = 1 - # max_size = 2 - # min_size = 1 - # capacity_type = "SPOT" - # instance_types = ["t3.medium"] - - # labels = merge(local.tags, { "stack" = "argocd" }) - # taints = { - # dedicated_argocd = { - # key = "stack" - # value = "argocd" - # effect = "NO_SCHEDULE" - # } - # } - # } } # Configure which roles, users and accounts can access the k8s api diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/identities/ids_fluentbit.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/identities/ids_fluentbit.tf new file mode 100644 index 000000000..323875688 --- /dev/null +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/identities/ids_fluentbit.tf @@ -0,0 +1,44 @@ +# +# Fluent-bit Roles & Policies +# +module "role_fluent_bit" { + source = "github.com/binbashar/terraform-aws-iam.git//modules/iam-assumable-role-with-oidc?ref=v5.2.0" + + providers = { + aws = aws.shared + } + + create_role = true + role_name = "${local.environment}-fluent-bit" + provider_url = replace(data.terraform_remote_state.cluster.outputs.cluster_oidc_issuer_url, "https://", "") + + role_policy_arns = [ + aws_iam_policy.fluent_bit.arn + ] + oidc_fully_qualified_subjects = [ + "system:serviceaccount:monitoring-logging:fluent-bit" + ] + + tags = local.tags_fluent_bit +} + +resource "aws_iam_policy" "fluent_bit" { + provider = aws.shared + name = "${local.environment}-fluent-bit" + description = "Fluent Bit" + tags = local.tags_fluent_bit + policy = <<-EOF + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "es:ESHttp*" + ], + 
"Resource": "arn:aws:es:${var.region}:${var.accounts.shared.id}:domain/example-domain" + } + ] + } + EOF +} diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/identities/ids_grafana.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/identities/ids_grafana.tf new file mode 100644 index 000000000..18c7de978 --- /dev/null +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/identities/ids_grafana.tf @@ -0,0 +1,70 @@ +# +# Grafana Roles & Policies +# +module "role_grafana" { + source = "github.com/binbashar/terraform-aws-iam.git//modules/iam-assumable-role-with-oidc?ref=v5.2.0" + + create_role = true + role_name = "${local.environment}-grafana" + provider_url = replace(data.terraform_remote_state.cluster.outputs.cluster_oidc_issuer_url, "https://", "") + + role_policy_arns = [ + aws_iam_policy.grafana.arn + ] + oidc_fully_qualified_subjects = [ + "system:serviceaccount:monitoring-metrics:grafana" + ] + + tags = local.tags_grafana +} + +resource "aws_iam_policy" "grafana" { + name = "${local.environment}-grafana" + description = "Grafana permissions for fetching data from Cloudwatch" + tags = local.tags_grafana + policy = <<-EOF + { + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "AllowReadingMetricsFromCloudWatch", + "Effect": "Allow", + "Action": [ + "cloudwatch:DescribeAlarmsForMetric", + "cloudwatch:DescribeAlarmHistory", + "cloudwatch:DescribeAlarms", + "cloudwatch:ListMetrics", + "cloudwatch:GetMetricData", + "cloudwatch:GetInsightRuleReport" + ], + "Resource": "*" + }, + { + "Sid": "AllowReadingLogsFromCloudWatch", + "Effect": "Allow", + "Action": [ + "logs:DescribeLogGroups", + "logs:GetLogGroupFields", + "logs:StartQuery", + "logs:StopQuery", + "logs:GetQueryResults", + "logs:GetLogEvents" + ], + "Resource": "*" + }, + { + "Sid": "AllowReadingTagsInstancesRegionsFromEC2", + "Effect": "Allow", + "Action": ["ec2:DescribeTags", "ec2:DescribeInstances", "ec2:DescribeRegions"], + "Resource": "*" + }, + { + "Sid": "AllowReadingResourcesForTags", + "Effect": "Allow", + 
"Action": "tag:GetResources", + "Resource": "*" + } + ] + } + EOF +} diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/identities/locals.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/identities/locals.tf index 18478f2cb..12da578e3 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/identities/locals.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/identities/locals.tf @@ -14,6 +14,8 @@ locals { tags_externaldns_public = merge(local.tags, { Subject = "externaldns-public" }) tags_aws_lb_controller = merge(local.tags, { Subject = "aws-lb-controller" }) tags_external_secrets = merge(local.tags, { Subject = "external-secrets" }) + tags_grafana = merge(local.tags, { Subject = "grafana" }) + tags_fluent_bit = merge(local.tags, { Subject = "fluent-bit" }) tags_argo_image_updater = merge(local.tags, { Subject = "argo-image-updater" }) tags_vpc_cni = merge(local.tags, { Subject = "vpc-cni" }) tags_ebs_csi = merge(local.tags, { Subject = "ebs-csi" }) diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/identities/outputs.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/identities/outputs.tf index 86a721e68..372af4b86 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/identities/outputs.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/identities/outputs.tf @@ -28,6 +28,16 @@ output "external_secrets_role_arn" { value = module.role_external_secrets.iam_role_arn } +output "grafana_role_arn" { + description = "Grafana Role ARN" + value = module.role_grafana.iam_role_arn +} + +output "fluent_bit_role_arn" { + description = "Fluent Bit Role ARN" + value = module.role_fluent_bit.iam_role_arn +} + output "argo_cd_image_updater_role_arn" { description = "Argo CD Image Updater Role ARN" value = module.role_argo_cd_image_updater.iam_role_arn diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/argo-cd.yaml b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/argo-cd.yaml index 88aa299c1..1350184d6 100644 --- 
a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/argo-cd.yaml +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/argo-cd.yaml @@ -1,13 +1,22 @@ # --------------------------------------------------------- -# ArgoCD Repo Server +# Global shared configs # --------------------------------------------------------- global: domain: ${argoHost} + nodeSelector: ${nodeSelector} + tolerations: ${tolerations} + +# --------------------------------------------------------- +# ArgoCD configs +# --------------------------------------------------------- configs: cm: exec.enabled: "${enableWebTerminal}" +# --------------------------------------------------------- +# Repository Server +# --------------------------------------------------------- repoServer: env: # Increase the default timeout applied when the repo-server executes config management tools @@ -15,6 +24,9 @@ repoServer: - name: ARGOCD_EXEC_TIMEOUT value: 3m0s +# --------------------------------------------------------- +# Server +# --------------------------------------------------------- server: ingress: enabled: true @@ -28,9 +40,294 @@ server: nginx.ingress.kubernetes.io/backend-protocol: "HTTPS" tls: true - extensions: - enabled: false - contents: - - name: argo-rollouts - url: https://github.com/argoproj-labs/rollout-extension/releases/download/v0.2.0/extension.tar +# --------------------------------------------------------- +# Notifications Controller +# --------------------------------------------------------- +notifications: + enabled: ${enableNotifications} + + secret: + create: ${enableNotifications} + items: + slack-token: ${slackNotificationsAppToken} + + # -- Notification services + notifiers: + service.slack: | + token: $slack-token + username: ArgoCD-PRD + icon: https://raw.githubusercontent.com/argoproj/argo-cd/master/docs/assets/logo.png + + # -- Centrally managed global application subscriptions + subscriptions: + - recipients: + - 
slack:${slackNotificationsChannel} + triggers: + - on-deployed + - on-health-degraded + - on-sync-failed + - on-sync-status-unknown + - on-sync-succeeded + + # -- Templates to generate the notification content + templates: + template.app-created: | + message: Application *{{(call .strings.ToUpper .app.metadata.name)}}* has been created. + template.app-deleted: | + message: Application *{{(call .strings.ToUpper .app.metadata.name)}}* has been deleted. + template.app-deployed: | + message: | + {{if eq .serviceType "slack"}}:white_check_mark:{{end}} Application *{{(call .strings.ToUpper .app.metadata.name)}}* is now running new version of deployments manifests. + slack: + attachments: | + [{ + "author_name": "{{(call .strings.ToUpper .app.metadata.name)}}", + "author_link": "https://{{.context.argocdUrl}}/applications/{{.app.metadata.name}}", + "color": "#18be52", + "fields": [ + { + "title": "Sync Status", + "value": "{{.app.status.sync.status}}", + "short": true + }, + { + "title": {{- if .app.spec.source }} "Repository" {{- else if .app.spec.sources }} "Repositories" {{- end }}, + "value": {{- if .app.spec.source }} ":arrow_heading_up: {{ .app.spec.source.repoURL }}" {{- else if .app.spec.sources }} "{{- range $index, $source := .app.spec.sources }}{{ if $index }}\n{{ end }}:arrow_heading_up: {{ $source.repoURL }}{{- end }}" {{- end }}, + "short": true + }, + { + "title": "Revision", + "value": "{{.app.status.sync.revision}}", + "short": true + } + {{range $index, $c := .app.status.conditions}} + , + { + "title": "{{$c.type}}", + "value": "{{$c.message}}", + "short": true + } + {{end}} + ] + }] + deliveryPolicy: Post + notifyBroadcast: false + template.app-health-degraded: | + message: | + {{if eq .serviceType "slack"}}:exclamation:{{end}} Application *{{(call .strings.ToUpper .app.metadata.name)}}* has degraded. + Application details: {{.context.argocdUrl}}/applications/{{.app.metadata.name}}. 
+ slack: + attachments: | + [{ + "author_name": "{{(call .strings.ToUpper .app.metadata.name)}}", + "author_link": "https://{{.context.argocdUrl}}/applications/{{.app.metadata.name}}", + "color": "#f4c030", + "fields": [ + { + "title": "Health Status", + "value": "{{.app.status.health.status}}", + "short": true + }, + { + "title": {{- if .app.spec.source }} "Repository" {{- else if .app.spec.sources }} "Repositories" {{- end }}, + "value": {{- if .app.spec.source }} ":arrow_heading_up: {{ .app.spec.source.repoURL }}" {{- else if .app.spec.sources }} "{{- range $index, $source := .app.spec.sources }}{{ if $index }}\n{{ end }}:arrow_heading_up: {{ $source.repoURL }}{{- end }}" {{- end }}, + "short": true + } + {{range $index, $c := .app.status.conditions}} + , + { + "title": "{{$c.type}}", + "value": "{{$c.message}}", + "short": true + } + {{end}} + ] + }] + deliveryPolicy: Post + notifyBroadcast: false + template.app-sync-failed: | + message: | + {{if eq .serviceType "slack"}}:exclamation:{{end}} The sync operation of application *{{(call .strings.ToUpper .app.metadata.name)}}* has failed at {{.app.status.operationState.finishedAt}} with the following error: {{.app.status.operationState.message}} + Sync operation details are available at: {{.context.argocdUrl}}/applications/{{.app.metadata.name}}?operation=true . 
+ slack: + attachments: | + [{ + "author_name": "{{(call .strings.ToUpper .app.metadata.name)}}", + "author_link": "https://{{.context.argocdUrl}}/applications/{{.app.metadata.name}}", + "color": "#E96D76", + "fields": [ + { + "title": "Sync Status", + "value": "{{.app.status.sync.status}}", + "short": true + }, + { + "title": {{- if .app.spec.source }} "Repository" {{- else if .app.spec.sources }} "Repositories" {{- end }}, + "value": {{- if .app.spec.source }} ":arrow_heading_up: {{ .app.spec.source.repoURL }}" {{- else if .app.spec.sources }} "{{- range $index, $source := .app.spec.sources }}{{ if $index }}\n{{ end }}:arrow_heading_up: {{ $source.repoURL }}{{- end }}" {{- end }}, + "short": true + } + {{range $index, $c := .app.status.conditions}} + , + { + "title": "{{$c.type}}", + "value": "{{$c.message}}", + "short": true + } + {{end}} + ] + }] + deliveryPolicy: Post + notifyBroadcast: false + template.app-sync-running: | + message: | + The sync operation of application *{{(call .strings.ToUpper .app.metadata.name)}}* has started at {{.app.status.operationState.startedAt}}. + Sync operation details are available at: {{.context.argocdUrl}}/applications/{{.app.metadata.name}}?operation=true . 
+ slack: + attachments: | + [{ + "author_name": "{{(call .strings.ToUpper .app.metadata.name)}}", + "author_link": "https://{{.context.argocdUrl}}/applications/{{.app.metadata.name}}", + "color": "#0DADEA", + "fields": [ + { + "title": "Sync Status", + "value": "{{.app.status.sync.status}}", + "short": true + }, + { + "title": {{- if .app.spec.source }} "Repository" {{- else if .app.spec.sources }} "Repositories" {{- end }}, + "value": {{- if .app.spec.source }} ":arrow_heading_up: {{ .app.spec.source.repoURL }}" {{- else if .app.spec.sources }} "{{- range $index, $source := .app.spec.sources }}{{ if $index }}\n{{ end }}:arrow_heading_up: {{ $source.repoURL }}{{- end }}" {{- end }}, + "short": true + } + {{range $index, $c := .app.status.conditions}} + , + { + "title": "{{$c.type}}", + "value": "{{$c.message}}", + "short": true + } + {{end}} + ] + }] + deliveryPolicy: Post + notifyBroadcast: false + template.app-sync-status-unknown: | + message: | + {{if eq .serviceType "slack"}}:exclamation:{{end}} Application *{{(call .strings.ToUpper .app.metadata.name)}}* sync is 'Unknown'. + Application details: {{.context.argocdUrl}}/applications/{{.app.metadata.name}}. 
+ {{if ne .serviceType "slack"}} + {{range $c := .app.status.conditions}} + * {{$c.message}} + {{end}} + {{end}} + slack: + attachments: | + [{ + "author_name": "{{(call .strings.ToUpper .app.metadata.name)}}", + "author_link": "https://{{.context.argocdUrl}}/applications/{{.app.metadata.name}}", + "color": "#E96D76", + "fields": [ + { + "title": "Sync Status", + "value": "{{.app.status.sync.status}}", + "short": true + }, + { + "title": {{- if .app.spec.source }} "Repository" {{- else if .app.spec.sources }} "Repositories" {{- end }}, + "value": {{- if .app.spec.source }} ":arrow_heading_up: {{ .app.spec.source.repoURL }}" {{- else if .app.spec.sources }} "{{- range $index, $source := .app.spec.sources }}{{ if $index }}\n{{ end }}:arrow_heading_up: {{ $source.repoURL }}{{- end }}" {{- end }}, + "short": true + } + {{range $index, $c := .app.status.conditions}} + , + { + "title": "{{$c.type}}", + "value": "{{$c.message}}", + "short": true + } + {{end}} + ] + }] + deliveryPolicy: Post + notifyBroadcast: false + template.app-sync-succeeded: | + message: | + {{if eq .serviceType "slack"}}:white_check_mark:{{end}} Application *{{(call .strings.ToUpper .app.metadata.name)}}* has been successfully synced at {{.app.status.operationState.finishedAt}}. + Sync operation details are available at: {{.context.argocdUrl}}/applications/{{.app.metadata.name}}?operation=true . 
+ slack: + attachments: | + [{ + "author_name": "{{(call .strings.ToUpper .app.metadata.name)}}", + "author_link": "https://{{.context.argocdUrl}}/applications/{{.app.metadata.name}}", + "color": "#18be52", + "fields": [ + { + "title": "Sync Status", + "value": "{{.app.status.sync.status}}", + "short": true + }, + { + "title": {{- if .app.spec.source }} "Repository" {{- else if .app.spec.sources }} "Repositories" {{- end }}, + "value": {{- if .app.spec.source }} ":arrow_heading_up: {{ .app.spec.source.repoURL }}" {{- else if .app.spec.sources }} "{{- range $index, $source := .app.spec.sources }}{{ if $index }}\n{{ end }}:arrow_heading_up: {{ $source.repoURL }}{{- end }}" {{- end }}, + "short": true + } + {{range $index, $c := .app.status.conditions}} + , + { + "title": "{{$c.type}}", + "value": "{{$c.message}}", + "short": true + } + {{end}} + ] + }] + deliveryPolicy: Post + notifyBroadcast: false + + # -- Conditions when the notifications should be sent + triggers: + trigger.on-created: | + - description: Application is created. + oncePer: app.metadata.name + send: + - app-created + when: "true" + trigger.on-deleted: | + - description: Application is deleted. + oncePer: app.metadata.name + send: + - app-deleted + when: app.metadata.deletionTimestamp != nil + trigger.on-deployed: | + - description: Application is synced and healthy. Triggered once per commit. 
+ oncePer: app.status.operationState.syncResult.revision + send: + - app-deployed + when: app.status.operationState != nil and app.status.operationState.phase in ['Succeeded'] + and app.status.health.status == 'Healthy' + trigger.on-health-degraded: | + - description: Application has degraded + send: + - app-health-degraded + when: app.status.health.status == 'Degraded' + trigger.on-sync-failed: | + - description: Application syncing has failed + send: + - app-sync-failed + when: app.status.operationState != nil and app.status.operationState.phase in ['Error', + 'Failed'] + trigger.on-sync-running: | + - description: Application is being synced + send: + - app-sync-running + when: app.status.operationState != nil and app.status.operationState.phase in ['Running'] + trigger.on-sync-status-unknown: | + - description: Application status is 'Unknown' + send: + - app-sync-status-unknown + when: app.status.sync.status == 'Unknown' + trigger.on-sync-succeeded: | + - description: Application syncing has succeeded + send: + - app-sync-succeeded + when: app.status.operationState != nil and app.status.operationState.phase in ['Succeeded'] diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/argo-rollouts.yaml b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/argo-rollouts.yaml index 5013f02a1..9debd009f 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/argo-rollouts.yaml +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/argo-rollouts.yaml @@ -1,8 +1,21 @@ +# --------------------------------------------------------- +# Controller configs +# --------------------------------------------------------- +controller: + nodeSelector: ${nodeSelector} + tolerations: ${tolerations} + +# --------------------------------------------------------- +# Dashboard configs +# --------------------------------------------------------- dashboard: - enabled: true + enabled: ${enableDashboard} + + 
nodeSelector: ${nodeSelector} + tolerations: ${tolerations} ingress: - enabled: true + enabled: ${enableDashboard} annotations: kubernetes.io/tls-acme: "true" # Associate this ingress with the following ingress controller diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/argocd-image-updater.yaml b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/argocd-image-updater.yaml index 452def9b9..342467e8d 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/argocd-image-updater.yaml +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/argocd-image-updater.yaml @@ -21,7 +21,6 @@ config: # Whether to use plain text connection (http) instead of TLS (https) argocd.plaintext: "false" -config: gitCommitUser: ${gitCommitUser} gitCommitMail: ${gitCommitMail} gitCommitTemplate: | @@ -48,3 +47,6 @@ serviceAccount: annotations: eks.amazonaws.com/role-arn: ${roleArn} name: "argocd-image-updater" + +nodeSelector: ${nodeSelector} +tolerations: ${tolerations} diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/fluentbit.yaml b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/fluentbit.yaml index af447dc95..09f6ee8d0 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/fluentbit.yaml +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/fluentbit.yaml @@ -1,6 +1,13 @@ # Either DaemonSet or Deployment kind: DaemonSet +serviceAccount: + name: fluent-bit + annotations: + eks.amazonaws.com/role-arn: ${role_arn} + +tolerations: ${tolerations} + ## Ref: https://docs.fluentbit.io/manual/administration/configuring-fluent-bit/configuration-file config: service: | @@ -9,6 +16,7 @@ config: Flush {{ .Values.flush }} Log_Level {{ .Values.logLevel }} Parsers_File parsers.conf + Parsers_File custom_parsers.conf HTTP_Server On HTTP_Listen 0.0.0.0 HTTP_Port {{ .Values.metricsPort }} @@ -30,7 +38,7 @@ config: [FILTER] 
Name kubernetes Match kube.* - Merge_Log Off + Merge_Log On Keep_Log Off K8S-Logging.Parser On K8S-Logging.Exclude On @@ -38,22 +46,22 @@ config: ## Ref: https://docs.fluentbit.io/manual/pipeline/outputs outputs: | [OUTPUT] - Name es + Name opensearch Match kube.* - Host ${es_host} - Port ${es_port} - HTTP_User ${es_user} - HTTP_Passwd ${es_password} + Host ${opensearch_host} + Port ${opensearch_port} + AWS_Auth On + AWS_Region ${region} tls On tls.verify On - Suppress_Type_Name On - Index kube_devstg + Index kube_${opensearch_index_suffix} + logstash_prefix kube_${opensearch_index_suffix} logstash_format On - logstash_prefix kube_devstg + Replace_Dots On + Suppress_Type_Name On time_key @timestamp Buffer_Size 4M Retry_Limit False - Suppress_Type_Name On ## Ref: https://docs.fluentbit.io/manual/pipeline/parsers customParsers: | diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/gatus.yaml b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/gatus.yaml new file mode 100644 index 000000000..23d167e7b --- /dev/null +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/gatus.yaml @@ -0,0 +1,52 @@ +#------------------------------------------------------------------------------ +# Configure endpoint to monitor +#------------------------------------------------------------------------------ +config: + services: + - name: Binbash + url: https://binbash.com.ar + interval: 30s + conditions: + - '[STATUS] == 200' + # alerts: + # - type: slack + # enabled: true + # description: "healthcheck failed 3 times in a row" + # send-on-resolved: true + - name: Leverage + url: https://leverage.binbash.com.ar + interval: 30s + conditions: + - '[STATUS] == 200' + # alerts: + # - type: slack + # enabled: true + # description: "healthcheck failed 3 times in a row" + # send-on-resolved: true + +#------------------------------------------------------------------------------ +# Alerting settings 
+#------------------------------------------------------------------------------ +# alerting: +# slack: +# webhook-url: "https://hooks.slack.com/services/**********/**********/**********" + +#------------------------------------------------------------------------------ +# Ingress settings +#------------------------------------------------------------------------------ +ingress: + enabled: true + annotations: + kubernetes.io/tls-acme: "true" + # Associate this ingress with the following ingress controller + kubernetes.io/ingress.class: private-apps + # Use the following cert-manager clusterissuer + cert-manager.io/cluster-issuer: clusterissuer-binbash-cert-manager-clusterissuer + # The backend is expecting to handle HTTPS + #nginx.ingress.kubernetes.io/backend-protocol: "HTTPS" + hosts: + - ${gatusHost} + tls: + - secretName: gatus-server-tls + hosts: + - ${gatusHost} diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/goldilocks.yaml b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/goldilocks.yaml new file mode 100644 index 000000000..dc5ada912 --- /dev/null +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/goldilocks.yaml @@ -0,0 +1,23 @@ +# Whether the dashboard component should be installed +dashboard: + enabled: true + replicaCount: 1 + + ingress: + enabled: true + annotations: + kubernetes.io/tls-acme: "true" + # Associate this ingress with the following ingress controller + kubernetes.io/ingress.class: private-apps + # Use the following cert-manager clusterissuer + cert-manager.io/cluster-issuer: clusterissuer-binbash-cert-manager-clusterissuer + # The backend is expecting to handle HTTPS + #nginx.ingress.kubernetes.io/backend-protocol: "HTTPS" + hosts: + - host: ${goldilocksHost} + paths: + - / + tls: + - secretName: goldilocks-server-tls + hosts: + - ${goldilocksHost} diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/kube-prometheus-stack.yaml 
b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/kube-prometheus-stack.yaml index 9fd6d6dd8..07603b28f 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/kube-prometheus-stack.yaml +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/chart-values/kube-prometheus-stack.yaml @@ -2,11 +2,67 @@ # AlertManager # ------------------------------------------------------------------------------------------- alertmanager: - # TODO Enable and create a working reference - enabled: false + enabled: true - nodeSelector: ${nodeSelector} - tolerations: ${tolerations} + config: + global: + slack_api_url: ${alertmanagerSlackWebhook} + route: + group_by: + - namespace + group_wait: 30s + group_interval: 5m + repeat_interval: 1h + receiver: 'slack-notifications' + routes: + - receiver: 'slack-notifications' + continue: true + receivers: + - name: 'slack-notifications' + slack_configs: + - channel: '#${alertmanagerSlackChannel}' + send_resolved: true + text: '{{ template "slack.alert.text" }}' + + templateFiles: + alert.tmpl: |- + {{ define "slack.alert.text" }} + {{ range .Alerts }} + *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` \n + *Environment:* {{ .Labels.cluster }} \n + *Description:* {{ .Annotations.description }} \n + *Graph:* <{{ .GeneratorURL }}|:chart_with_upwards_trend:> - *Runbook:* <{{ .Annotations.runbook }}|:documentation:> \n + *Details:* \n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` \n + {{ end }} + {{ end }} + {{ end }} + + ingress: + enabled: true + annotations: + kubernetes.io/tls-acme: 'true' + kubernetes.io/ingress.class: ${privateIngressClass} + cert-manager.io/cluster-issuer: clusterissuer-arta-cert-manager-clusterissuer + hosts: + - alertmanager.${platform}.${privateBaseDomain} + path: / + tls: + - secretName: alertmananager-tls + hosts: + - alertmanager.${platform}.${privateBaseDomain} + + alertmanagerSpec: + nodeSelector: ${nodeSelector} + tolerations: 
${tolerations} + + storage: + volumeClaimTemplate: + spec: + storageClassName: gp2 + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 2Gi # ------------------------------------------------------------------------------------------- # Grafana @@ -14,26 +70,42 @@ alertmanager: grafana: enabled: true - adminUser: admin - # TODO Pass this as a secret - adminPassword: admin123456 + adminUser: ${grafanaUser} + adminPassword: ${grafanaPassword} - nodeSelector: ${nodeSelector} - tolerations: ${tolerations} + serviceAccount: + name: grafana + annotations: + eks.amazonaws.com/role-arn: ${grafanaRoleArn} ingress: enabled: true annotations: kubernetes.io/tls-acme: 'true' - kubernetes.io/ingress.class: ${private_ingress_class} + kubernetes.io/ingress.class: ${privateIngressClass} cert-manager.io/cluster-issuer: clusterissuer-binbash-cert-manager-clusterissuer hosts: - - grafana.${platform}.${private_base_domain} + - grafana.${platform}.${privateBaseDomain} path: / tls: - secretName: grafana-tls hosts: - - grafana.${platform}.${private_base_domain} + - grafana.${platform}.${privateBaseDomain} + + nodeSelector: ${nodeSelector} + tolerations: ${tolerations} + + persistence: + enabled: true + size: 5Gi + storageClassName: gp2 + + # additionalDataSources: + # - name: CloudWatch + # type: cloudwatch + # jsonData: + # authType: default + # defaultRegion: us-east-1 # ------------------------------------------------------------------------------------------- # NodeExporter @@ -57,23 +129,34 @@ kube-state-metrics: prometheus: enabled: true - prometheusSpec: - nodeSelector: ${nodeSelector} - tolerations: ${tolerations} - ingress: enabled: true annotations: kubernetes.io/tls-acme: 'true' - kubernetes.io/ingress.class: ${private_ingress_class} + kubernetes.io/ingress.class: ${privateIngressClass} cert-manager.io/cluster-issuer: clusterissuer-binbash-cert-manager-clusterissuer hosts: - - prometheus.${platform}.${private_base_domain} + - 
prometheus.${platform}.${privateBaseDomain} path: / tls: - secretName: prometheus-tls hosts: - - prometheus.${platform}.${private_base_domain} + - prometheus.${platform}.${privateBaseDomain} + + prometheusSpec: + nodeSelector: ${nodeSelector} + tolerations: ${tolerations} + + retention: 30d + + storageSpec: + volumeClaimTemplate: + spec: + storageClassName: gp2 + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 20Gi # additionalServiceMonitors: # - name: my-app @@ -122,4 +205,4 @@ kubeControllerManager: enabled: false kubeScheduler: - enabled: false \ No newline at end of file + enabled: false diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/cicd-argo.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/cicd-argo.tf index c139e11c6..ae4342e83 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/cicd-argo.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/cicd-argo.tf @@ -1,38 +1,47 @@ #------------------------------------------------------------------------------ # ArgoCD: GitOps + CD #------------------------------------------------------------------------------ +data "aws_secretsmanager_secret_version" "argocd_admin_password" { + count = var.argocd.enabled ? 1 : 0 + secret_id = "/k8s-eks-demoapps/argocdserveradminpassword" +} + data "aws_secretsmanager_secret_version" "demo_google_microservices_deploy_key" { + count = var.argocd.enabled ? 1 : 0 provider = aws.shared secret_id = "/repositories/demo-google-microservices/deploy_key" } data "aws_secretsmanager_secret_version" "le_demo_deploy_key" { + count = var.argocd.enabled ? 
1 : 0 provider = aws.shared secret_id = "/repositories/le-demo-apps/deploy_key" } -# argocd_admin_password -data "aws_secretsmanager_secret" "argocd_admin_password" { - name = "/k8s-eks-demoapps/argocdserveradminpassword" -} - -# Get the latest secret version -data "aws_secretsmanager_secret_version" "argocd_admin_password" { - secret_id = data.aws_secretsmanager_secret.argocd_admin_password.id +data "aws_secretsmanager_secret_version" "argocd_slack_notifications_app_oauth" { + count = var.argocd.enabled && var.argocd.enableNotifications ? 1 : 0 + provider = aws.shared + secret_id = "/notifications/devstg/argocd" } resource "helm_release" "argocd" { - count = var.enable_cicd ? 1 : 0 + count = var.argocd.enabled ? 1 : 0 name = "argocd" namespace = kubernetes_namespace.argocd[0].id repository = "https://argoproj.github.io/argo-helm" chart = "argo-cd" - version = "5.7.0" + version = "7.7.5" values = [ templatefile("chart-values/argo-cd.yaml", { - argoHost = "argocd.${local.platform}.${local.private_base_domain}" - ingressClass = local.private_ingress_class + argoHost = "argocd.${local.platform}.${local.private_base_domain}", + ingressClass = local.private_ingress_class, + enableWebTerminal = var.argocd.enableWebTerminal, + enableNotifications = var.argocd.enableNotifications, + slackNotificationsAppToken = var.argocd.enableNotifications ? 
jsondecode(data.aws_secretsmanager_secret_version.argocd_slack_notifications_app_oauth[0].secret_string)["slack_app_oauth_token"] : "", + slackNotificationsChannel = local.argocd_slack_notifications_channel, + nodeSelector = local.tools_nodeSelector, + tolerations = local.tools_tolerations }), # We are using a different approach here because it is very tricky to render # properly the multi-line sshPrivateKey using 'templatefile' function @@ -40,20 +49,20 @@ resource "helm_release" "argocd" { configs = { secret = { # Get argocd admin password from AWS Secrets Manager - argocdServerAdminPassword = data.aws_secretsmanager_secret_version.argocd_admin_password.secret_string + argocdServerAdminPassword = data.aws_secretsmanager_secret_version.argocd_admin_password[0].secret_string } repositories = { demo-google-microservices = { name = "demo-google-microservices" project = "default" - sshPrivateKey = data.aws_secretsmanager_secret_version.demo_google_microservices_deploy_key.secret_string + sshPrivateKey = data.aws_secretsmanager_secret_version.demo_google_microservices_deploy_key[0].secret_string type = "git" url = "git@github.com:binbashar/demo-google-microservices.git" } le-demo-apps = { name = "le-demo-apps" project = "default" - sshPrivateKey = data.aws_secretsmanager_secret_version.le_demo_deploy_key.secret_string + sshPrivateKey = data.aws_secretsmanager_secret_version.le_demo_deploy_key[0].secret_string type = "git" url = "git@github.com:binbashar/le-demo-apps.git" } @@ -73,18 +82,20 @@ resource "helm_release" "argocd" { # ArgoCD Image Updater #------------------------------------------------------------------------------ resource "helm_release" "argocd_image_updater" { - count = var.enable_argocd_image_updater ? 1 : 0 + count = var.argocd.image_updater.enabled ? 
1 : 0 name = "argocd-image-updater" namespace = kubernetes_namespace.argocd[0].id repository = "https://argoproj.github.io/argo-helm" chart = "argocd-image-updater" - version = "0.11.1" + version = "0.11.2" values = [ templatefile("chart-values/argocd-image-updater.yaml", { region = var.region argoHost = "argocd.${local.platform}.${local.private_base_domain}", repositoryApiUrl = "${var.accounts.shared.id}.dkr.ecr.${var.region}.amazonaws.com", roleArn = data.terraform_remote_state.cluster-identities.outputs.argo_cd_image_updater_role_arn, + nodeSelector = local.tools_nodeSelector, + tolerations = local.tools_tolerations, gitCommitUser = "binbash-machine-user" gitCommitMail = "leverage-aws+machine-user@binbash.com.ar" gitCommitMessageTemplate = <<-TMP @@ -106,18 +117,22 @@ resource "helm_release" "argocd_image_updater" { # Argo Rollouts #------------------------------------------------------------------------------ resource "helm_release" "argo_rollouts" { - count = var.enable_argo_rollouts ? 1 : 0 + count = var.argocd.rollouts.enabled ? 
1 : 0 name = "argo-rollouts" namespace = kubernetes_namespace.argocd[0].id repository = "https://argoproj.github.io/argo-helm" chart = "argo-rollouts" - version = "2.37.0" + version = "2.38.0" values = [ templatefile("chart-values/argo-rollouts.yaml", { - rolloutsHost = "rollouts.${local.platform}.${local.private_base_domain}" - ingressClass = local.private_ingress_class - })] + enableDashboard = var.argocd.rollouts.dashboard.enabled, + rolloutsHost = "rollouts.${local.platform}.${local.private_base_domain}", + ingressClass = local.private_ingress_class, + nodeSelector = local.tools_nodeSelector, + tolerations = local.tools_tolerations + }) + ] depends_on = [ helm_release.alb_ingress, diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/common-variables.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/common-variables.tf deleted file mode 100644 index fe03a7452..000000000 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/common-variables.tf +++ /dev/null @@ -1,133 +0,0 @@ -#================================# -# Common variables # -#================================# - -# -# config/backend.config -# -#================================# -# Terraform AWS Backend Settings # -#================================# -variable "region" { - type = string - description = "AWS Region" -} - -variable "profile" { - type = string - description = "AWS Profile (required by the backend but also used for other resources)" -} - -variable "bucket" { - type = string - description = "AWS S3 TF State Backend Bucket" -} - -variable "dynamodb_table" { - type = string - description = "AWS DynamoDB TF Lock state table name" -} - -variable "encrypt" { - type = bool - description = "Enable AWS DynamoDB with server side encryption" -} - -# -# config/base.config -# -#=============================# -# Project Variables # -#=============================# -variable "project" { - type = string - description = "Project Name" -} - -variable "project_long" { - type = string - 
description = "Project Long Name" -} - -variable "environment" { - type = string - description = "Environment Name" -} - -# -# config/extra.config -# -#=============================# -# Accounts & Extra Vars # -#=============================# -variable "region_secondary" { - type = string - description = "AWS Scondary Region for HA" -} - -variable "accounts" { - type = map(any) - description = "Accounts descriptions" -} - -variable "vault_address" { - type = string - description = "Vault Address" -} - -variable "vault_token" { - type = string - description = "Vault Token" -} - -#=============================# -# AWS SSO Variables # -#=============================# -variable "sso_role" { - description = "SSO Role Name" -} - -variable "sso_enabled" { - type = string - description = "Enable SSO Service" -} - -variable "sso_region" { - type = string - description = "SSO Region" -} - -variable "sso_start_url" { - type = string - description = "SSO Start Url" -} - -#===========================================# -# Networking # -#===========================================# -variable "enable_tgw" { - description = "Enable Transit Gateway Support" - type = bool - default = false -} - -variable "enable_tgw_multi_region" { - description = "Enable Transit Gateway multi region support" - type = bool - default = false -} - -variable "tgw_cidrs" { - description = "CIDRs to be added as routes to public RT" - type = list(string) - default = [] -} - -#===========================================# -# Security compliance -#===========================================# -variable "enable_inspector" { - description = "Turn inspector on/off" - type = bool - default = false -} diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/common-variables.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/common-variables.tf new file mode 120000 index 000000000..2f807a597 --- /dev/null +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/common-variables.tf @@ -0,0 +1 @@ 
+../../../../config/common-variables.tf \ No newline at end of file diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/config.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/config.tf index 222487c47..e0ca6a8ae 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/config.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/config.tf @@ -30,7 +30,7 @@ provider "helm" { # Backend Config (partial) #------------------------------------------------------------------------------ terraform { - required_version = "~> 1.3" + required_version = "~> 1.6" required_providers { aws = "~> 5.24" diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/identity-external-prometheus.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/identity-external-prometheus.tf index 1816c5057..94dd5c76d 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/identity-external-prometheus.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/identity-external-prometheus.tf @@ -2,7 +2,7 @@ # Service Account & Permissions: External Prometheus #------------------------------------------------------------------------------ resource "kubernetes_cluster_role" "external_prometheus" { - count = var.enable_prometheus_dependencies ? 1 : 0 + count = var.prometheus.external.dependencies.enabled ? 1 : 0 metadata { name = "external-prometheus" @@ -38,7 +38,7 @@ resource "kubernetes_cluster_role" "external_prometheus" { } resource "kubernetes_cluster_role_binding" "external_prometheus" { - count = var.enable_prometheus_dependencies ? 1 : 0 + count = var.prometheus.external.dependencies.enabled ? 1 : 0 metadata { name = "external-prometheus" @@ -56,7 +56,7 @@ resource "kubernetes_cluster_role_binding" "external_prometheus" { } resource "kubernetes_service_account" "external_prometheus" { - count = var.enable_prometheus_dependencies ? 1 : 0 + count = var.prometheus.external.dependencies.enabled ? 
1 : 0 metadata { name = "external-prometheus" diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/identity-grafana-kubegraf.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/identity-grafana-kubegraf.tf index 28056f444..20bd1c47d 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/identity-grafana-kubegraf.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/identity-grafana-kubegraf.tf @@ -2,7 +2,7 @@ # Service Account & Permissions: Grafana KubeGraf Application #------------------------------------------------------------------------------ resource "kubernetes_cluster_role" "grafana_kubegraf" { - count = var.enable_grafana_dependencies ? 1 : 0 + count = var.prometheus.external.grafana_dependencies.enabled ? 1 : 0 metadata { name = "grafana-kubegraf" @@ -42,7 +42,7 @@ resource "kubernetes_cluster_role" "grafana_kubegraf" { } resource "kubernetes_cluster_role_binding" "grafana_kubegraf" { - count = var.enable_grafana_dependencies ? 1 : 0 + count = var.prometheus.external.grafana_dependencies.enabled ? 1 : 0 metadata { name = "grafana-kubegraf" @@ -60,7 +60,7 @@ resource "kubernetes_cluster_role_binding" "grafana_kubegraf" { } resource "kubernetes_service_account" "grafana_kubegraf" { - count = var.enable_grafana_dependencies ? 1 : 0 + count = var.prometheus.external.grafana_dependencies.enabled ? 1 : 0 metadata { name = "grafana-kubegraf" diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/locals.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/locals.tf index 3cccacb8f..b825d6559 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/locals.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/locals.tf @@ -55,7 +55,12 @@ locals { alb_ingress_to_nginx_ingress_tags_list = [ for k, v in local.alb_ingress_to_nginx_ingress_tags_map : "${k}=${v}" ] - eks_alb_logging_prefix = var.eks_alb_logging_prefix != "" ? 
var.eks_alb_logging_prefix : data.terraform_remote_state.cluster.outputs.cluster_name + eks_alb_logging_prefix = var.ingress.apps_ingress.logging.prefix != "" ? var.ingress.apps_ingress.logging.prefix : data.terraform_remote_state.cluster.outputs.cluster_name + + #------------------------------------------------------------------------------ + # Argo Settings + #------------------------------------------------------------------------------ + argocd_slack_notifications_channel = "le-tools-monitoring" #------------------------------------------------------------------------------ # Tools Node Group: Selectors and Tolerations diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-alerts.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-alerts.tf index 59c05a760..52c48f28a 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-alerts.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-alerts.tf @@ -9,7 +9,7 @@ # around to immediately check what's wrong. #------------------------------------------------------------------------------ resource "helm_release" "kwatch" { - count = var.enable_kwatch ? 1 : 0 + count = var.kwatch.enabled ? 
1 : 0 name = "kwatch" namespace = kubernetes_namespace.monitoring_alerts[0].id repository = "https://kwatch.dev/charts" diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-cost.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-cost.tf index 4c8c54694..51640394e 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-cost.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-cost.tf @@ -28,7 +28,7 @@ resource "helm_release" "kube_resource_report" { # `http://localhost:9090 #------------------------------------------------------------------------------ resource "helm_release" "cost_analyzer" { - count = var.cost_optimization.cost_analyzer && !var.enable_prometheus_stack ? 1 : 0 + count = var.cost_optimization.cost_analyzer && !var.prometheus.kube_stack.enabled ? 1 : 0 name = "cost-analyzer" namespace = kubernetes_namespace.monitoring_tools[0].id repository = "https://kubecost.github.io/cost-analyzer/" diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-logging.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-logging.tf index cee9a7568..046a11144 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-logging.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-logging.tf @@ -8,13 +8,15 @@ resource "helm_release" "fluentbit" { namespace = kubernetes_namespace.monitoring_logging[0].id repository = "https://fluent.github.io/helm-charts" chart = "fluent-bit" - version = "0.20.1" + version = "0.24.0" values = [ templatefile("chart-values/fluentbit.yaml", { - es_host = "elasticsearch.${local.private_base_domain}", - es_port = 443, - es_user = "elastic.user", # TODO pass secret via AWS Screts Manager - es_password = "elastic.password" # TODO pass secret via AWS Screts Manager + opensearch_host = "example-domain.${local.private_base_domain}", # Fetch this from a opensearch layer output + 
opensearch_port = 443, + opensearch_index_suffix = local.environment + region = var.region, + role_arn = data.terraform_remote_state.cluster-identities.outputs.fluent_bit_role_arn, # Make sure the role allows access to the domain set above + tolerations = local.tools_tolerations, }) ] } diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-metrics.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-metrics.tf index af3264362..5d01ee9d0 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-metrics.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-metrics.tf @@ -2,7 +2,7 @@ # Kube State Metrics: Expose cluster metrics. #------------------------------------------------------------------------------ resource "helm_release" "kube_state_metrics" { - count = var.enable_prometheus_dependencies ? 1 : 0 + count = var.prometheus.external.dependencies.enabled ? 1 : 0 name = "kube-state-metrics" namespace = kubernetes_namespace.monitoring_metrics[0].id repository = "https://charts.bitnami.com/bitnami" @@ -15,7 +15,7 @@ resource "helm_release" "kube_state_metrics" { # Node Exporter: Expose cluster node metrics. # ------------------------------------------------------------------------------ resource "helm_release" "node_exporter" { - count = var.enable_prometheus_dependencies ? 1 : 0 + count = var.prometheus.external.dependencies.enabled ? 1 : 0 name = "node-exporter" namespace = kubernetes_namespace.monitoring_metrics[0].id repository = "https://charts.bitnami.com/bitnami" @@ -28,7 +28,7 @@ resource "helm_release" "node_exporter" { # Metrics Server: Expose cluster metrics. #------------------------------------------------------------------------------ resource "helm_release" "metrics_server" { - count = (var.enable_hpa_scaling || var.enable_vpa_scaling) ? 1 : 0 + count = (var.scaling.hpa.enabled || var.scaling.vpa.enabled) ? 
1 : 0 name = "metrics-server" namespace = kubernetes_namespace.monitoring_metrics[0].id repository = "https://charts.bitnami.com/bitnami" @@ -37,11 +37,30 @@ resource "helm_release" "metrics_server" { values = [file("chart-values/metrics-server.yaml")] } -#------------------------------------------------------------------------------ -# Prometheus Stack: (in-cluster) Prometheus, Grafana, and AlertManager. -#------------------------------------------------------------------------------ +#-------------------------------------------------------------------------------- +# Kube Prometheus Stack: Full Prometheus + Alertmanager + Grafana implementation. +#-------------------------------------------------------------------------------- + +# +# Slack webhook +# +data "aws_secretsmanager_secret_version" "alertmanager_slack_webhook" { + count = var.prometheus.kube_stack.enabled && var.prometheus.kube_stack.alertmanager.enabled ? 1 : 0 + provider = aws.shared + secret_id = "/notifications/alertmanager" +} + +# +# Grafana's credentials +# +data "aws_secretsmanager_secret_version" "grafana" { + count = var.prometheus.kube_stack.enabled ? 1 : 0 + provider = aws.shared + secret_id = "/devops/monitoring/grafana/administrator" +} + resource "helm_release" "kube_prometheus_stack" { - count = var.kube_prometheus_stack.enabled && !var.cost_optimization.cost_analyzer ? 1 : 0 + count = var.prometheus.kube_stack.enabled && !var.cost_optimization.cost_analyzer ? 
1 : 0 name = "kube-prometheus-stack" namespace = kubernetes_namespace.prometheus[0].id repository = "https://prometheus-community.github.io/helm-charts" @@ -49,11 +68,16 @@ resource "helm_release" "kube_prometheus_stack" { version = "52.1.0" values = [templatefile("chart-values/kube-prometheus-stack.yaml", { - private_ingress_class = local.private_ingress_class - platform = local.platform - private_base_domain = local.private_base_domain - nodeSelector = local.tools_nodeSelector - tolerations = local.tools_tolerations + privateIngressClass = local.private_ingress_class + platform = local.platform + privateBaseDomain = local.private_base_domain + alertmanagerSlackWebhook = var.prometheus.kube_stack.alertmanager.enabled ? jsondecode(data.aws_secretsmanager_secret_version.alertmanager_slack_webhook[0].secret_string)["webhook"] : "" + alertmanagerSlackChannel = var.prometheus.kube_stack.alertmanager.enabled ? jsondecode(data.aws_secretsmanager_secret_version.alertmanager_slack_webhook[0].secret_string)["channel"] : "" + grafanaUser = jsondecode(data.aws_secretsmanager_secret_version.grafana[0].secret_string)["username"] + grafanaPassword = jsondecode(data.aws_secretsmanager_secret_version.grafana[0].secret_string)["password"] + grafanaRoleArn = data.terraform_remote_state.cluster-identities.outputs.grafana_role_arn + nodeSelector = local.tools_nodeSelector + tolerations = local.tools_tolerations }) ] } diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-other.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-other.tf index b7eb24e5d..d66943b19 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-other.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/monitoring-other.tf @@ -2,7 +2,7 @@ # Datadog Agent #------------------------------------------------------------------------------ resource "helm_release" "datadog_agent" { - count = var.enable_datadog_agent ? 
1 : 0 + count = var.datadog_agent.enabled ? 1 : 0 name = "datadog" namespace = kubernetes_namespace.monitoring_other[0].id repository = "https://helm.datadoghq.com" @@ -37,7 +37,7 @@ resource "helm_release" "datadog_agent" { # - Back up the volume used by Kuma and define/rehearse the restore procedure. #------------------------------------------------------------------------------ resource "helm_release" "uptime_kuma" { - count = var.enable_uptime_kuma ? 1 : 0 + count = var.uptime_kuma.enabled ? 1 : 0 name = "uptime-kuma" namespace = kubernetes_namespace.monitoring_other[0].id repository = "https://helm.irsigler.cloud" @@ -63,3 +63,25 @@ resource "helm_release" "uptime_kuma" { EOT ] } + +#------------------------------------------------------------------------------ +# Gatus: Monitor HTTP, TCP, ICMP and DNS. +#------------------------------------------------------------------------------ +resource "helm_release" "gatus" { + count = var.gatus.enabled ? 1 : 0 + name = "gatus" + namespace = kubernetes_namespace.monitoring_other[0].id + repository = "https://minicloudlabs.github.io/helm-charts" + chart = "gatus" + version = "1.1.4" + values = [ + templatefile("chart-values/gatus.yaml", { + gatusHost = "gatus.${local.platform}.${local.private_base_domain}" + }) + ] + depends_on = [ + helm_release.ingress_nginx_private, + helm_release.certmanager, + helm_release.externaldns_private + ] +} diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/namespaces.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/namespaces.tf index 03e8784a0..b57e22557 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/namespaces.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/namespaces.tf @@ -1,5 +1,5 @@ resource "kubernetes_namespace" "monitoring_metrics" { - count = var.enable_prometheus_dependencies || var.enable_prometheus_dependencies || var.enable_cluster_autoscaling || var.enable_hpa_scaling || var.enable_vpa_scaling ? 
1 : 0 + count = var.prometheus.external.dependencies.enabled || var.scaling.cluster_autoscaling.enabled || var.scaling.hpa.enabled || var.scaling.vpa.enabled || var.goldilocks.enabled ? 1 : 0 metadata { labels = local.labels @@ -17,7 +17,7 @@ resource "kubernetes_namespace" "monitoring_logging" { } resource "kubernetes_namespace" "monitoring_tools" { - count = var.enable_kubernetes_dashboard || var.enable_vpa_scaling || var.cost_optimization.kube_resource_report || var.cost_optimization.cost_analyzer ? 1 : 0 + count = var.scaling.vpa.enabled || var.cost_optimization.kube_resource_report || var.cost_optimization.cost_analyzer ? 1 : 0 metadata { labels = local.labels @@ -26,7 +26,7 @@ resource "kubernetes_namespace" "monitoring_tools" { } resource "kubernetes_namespace" "monitoring_other" { - count = var.enable_datadog_agent || var.enable_uptime_kuma ? 1 : 0 + count = var.datadog_agent.enabled || var.uptime_kuma.enabled || var.gatus.enabled ? 1 : 0 metadata { labels = local.labels @@ -35,7 +35,7 @@ resource "kubernetes_namespace" "monitoring_other" { } resource "kubernetes_namespace" "monitoring_alerts" { - count = var.enable_kwatch ? 1 : 0 + count = var.kwatch.enabled ? 1 : 0 metadata { labels = local.labels @@ -44,7 +44,7 @@ resource "kubernetes_namespace" "monitoring_alerts" { } resource "kubernetes_namespace" "ingress_nginx" { - count = var.enable_nginx_ingress_controller ? 1 : 0 + count = var.ingress.nginx_controller.enabled ? 1 : 0 metadata { labels = local.labels @@ -53,7 +53,7 @@ resource "kubernetes_namespace" "ingress_nginx" { } resource "kubernetes_namespace" "alb_ingress" { - count = var.enable_alb_ingress_controller ? 1 : 0 + count = var.ingress.alb_controller.enabled ? 1 : 0 metadata { labels = local.labels @@ -62,7 +62,7 @@ resource "kubernetes_namespace" "alb_ingress" { } resource "kubernetes_namespace" "certmanager" { - count = var.enable_certmanager ? 1 : 0 + count = var.certmanager.enabled ? 
1 : 0 metadata { labels = local.labels @@ -71,7 +71,7 @@ } resource "kubernetes_namespace" "externaldns" { - count = var.enable_private_dns_sync || var.enable_public_dns_sync ? 1 : 0 + count = var.dns_sync.private.enabled || var.dns_sync.public.enabled ? 1 : 0 metadata { labels = local.labels @@ -79,17 +79,8 @@ } } -resource "kubernetes_namespace" "vault" { - count = var.enable_vault ? 1 : 0 - - metadata { - labels = local.labels - name = "vault" - } -} - resource "kubernetes_namespace" "external-secrets" { - count = var.enable_external_secrets ? 1 : 0 + count = var.external_secrets.enabled ? 1 : 0 metadata { labels = local.labels @@ -98,7 +89,7 @@ } resource "kubernetes_namespace" "argocd" { - count = var.enable_cicd || var.enable_argocd_image_updater || var.enable_argo_rollouts ? 1 : 0 + count = var.argocd.enabled || var.argocd.image_updater.enabled || var.argocd.rollouts.enabled ? 1 : 0 metadata { labels = local.labels @@ -106,30 +97,8 @@ } } -resource "kubernetes_namespace" "gatus" { - count = var.enable_gatus ? 1 : 0 - - metadata { - labels = { - environment = var.environment - "goldilocks.fairwinds.com/enabled" = "true" - } - - name = "gatus" - } -} - -resource "kubernetes_namespace" "velero" { - count = var.enable_backups ? 1 : 0 - - metadata { - labels = local.labels - name = "velero" - } -} - resource "kubernetes_namespace" "prometheus" { - count = var.kube_prometheus_stack.enabled ? 1 : 0 + count = var.prometheus.kube_stack.enabled ? 1 : 0 metadata { labels = local.labels @@ -138,7 +107,7 @@ } resource "kubernetes_namespace" "scaling" { - count = var.enable_cluster_overprovisioning ? 1 : 0 + count = var.scaling.cluster_overprovisioning.enabled ? 
1 : 0 metadata { labels = local.labels @@ -147,7 +116,7 @@ resource "kubernetes_namespace" "scaling" { } resource "kubernetes_namespace" "keda" { - count = var.enable_keda ? 1 : 0 + count = var.keda.enabled ? 1 : 0 metadata { labels = local.labels diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/networking-dns.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/networking-dns.tf index 2c1283398..3dec16192 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/networking-dns.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/networking-dns.tf @@ -2,7 +2,7 @@ # External DNS (Private): Sync ingresses hosts with your DNS server. #------------------------------------------------------------------------------ resource "helm_release" "externaldns_private" { - count = var.enable_private_dns_sync ? 1 : 0 + count = var.dns_sync.private.enabled ? 1 : 0 # depends_on = [null_resource.download] @@ -28,7 +28,7 @@ resource "helm_release" "externaldns_private" { # External DNS (Public): Sync ingresses hosts with your DNS server. #------------------------------------------------------------------------------ resource "helm_release" "externaldns_public" { - count = var.enable_public_dns_sync ? 1 : 0 + count = var.dns_sync.public.enabled ? 1 : 0 name = "externaldns-public" namespace = kubernetes_namespace.externaldns[0].id diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/networking-ingress.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/networking-ingress.tf index cb9f1e394..027e73549 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/networking-ingress.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/networking-ingress.tf @@ -2,7 +2,7 @@ # AWS Load Balancer (Ingress) Controller: Route outside traffic to the cluster. 
#------------------------------------------------------------------------------ resource "helm_release" "alb_ingress" { - count = var.enable_alb_ingress_controller ? 1 : 0 + count = var.ingress.alb_controller.enabled ? 1 : 0 name = "alb-ingress" namespace = kubernetes_namespace.alb_ingress[0].id repository = "https://aws.github.io/eks-charts" @@ -22,7 +22,7 @@ resource "helm_release" "alb_ingress" { # Nginx Ingress (Private): Route inside traffic to services in the cluster. #------------------------------------------------------------------------------ resource "helm_release" "ingress_nginx_private" { - count = var.enable_nginx_ingress_controller ? 1 : 0 + count = var.ingress.nginx_controller.enabled ? 1 : 0 name = "ingress-nginx-private" namespace = kubernetes_namespace.ingress_nginx[0].id repository = "https://kubernetes.github.io/ingress-nginx" @@ -54,7 +54,7 @@ resource "helm_release" "ingress_nginx_private" { # #------------------------------------------------------------------------------ resource "kubernetes_ingress_v1" "apps" { - count = var.apps_ingress.enabled ? 1 : 0 + count = var.ingress.apps_ingress.enabled ? 
1 : 0 wait_for_load_balancer = true metadata { @@ -64,7 +64,7 @@ resource "kubernetes_ingress_v1" "apps" { # This is used by the ALB Ingress "kubernetes.io/ingress.class" = "${local.public_ingress_class}" # Load balancer type: internet-facing or internal - "alb.ingress.kubernetes.io/scheme" = var.apps_ingress.type + "alb.ingress.kubernetes.io/scheme" = var.ingress.apps_ingress.type # Group this LB under a custom group so it's not shared with other groups "alb.ingress.kubernetes.io/group.name" = "apps" # Nginx provides an endpoint for health checks @@ -83,7 +83,7 @@ resource "kubernetes_ingress_v1" "apps" { # NOTE: this is highly recommended when using an internet-facing ALB "alb.ingress.kubernetes.io/inbound-cidrs" = "0.0.0.0/0" # ALB access logs - "alb.ingress.kubernetes.io/load-balancer-attributes" = "access_logs.s3.enabled=${var.enable_eks_alb_logging},access_logs.s3.bucket=${var.project}-${var.environment}-alb-logs,access_logs.s3.prefix=${local.eks_alb_logging_prefix}" + "alb.ingress.kubernetes.io/load-balancer-attributes" = "access_logs.s3.enabled=${var.ingress.apps_ingress.logging.enabled},access_logs.s3.bucket=${var.project}-${var.environment}-alb-logs,access_logs.s3.prefix=${local.eks_alb_logging_prefix}" } } diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/scaling.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/scaling.tf index 5a4e2fe72..86442641d 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/scaling.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/scaling.tf @@ -2,7 +2,7 @@ # Vertical Pod Autoscaler: automatic pod vertical autoscaling. #------------------------------------------------------------------------------ resource "helm_release" "vpa" { - count = var.enable_vpa_scaling ? 1 : 0 + count = var.scaling.vpa.enabled ? 
1 : 0 name = "vpa" namespace = kubernetes_namespace.monitoring_metrics[0].id repository = "https://charts.fairwinds.com/stable" @@ -16,7 +16,7 @@ resource "helm_release" "vpa" { # Cluster Autoscaler: automatic cluster nodes autoscaling. #------------------------------------------------------------------------------ resource "helm_release" "cluster_autoscaling" { - count = var.enable_cluster_autoscaling ? 1 : 0 + count = var.scaling.cluster_autoscaling.enabled ? 1 : 0 name = "autoscaler" namespace = kubernetes_namespace.monitoring_metrics[0].id repository = "https://kubernetes.github.io/autoscaler" @@ -50,7 +50,7 @@ resource "helm_release" "cluster_autoscaling" { # Another option is to start with one replica and then use the proportional # autoscaler to control the minimum number of replicas there. resource "helm_release" "cluster_overprovisioner" { - count = var.enable_cluster_overprovisioning ? 1 : 0 + count = var.scaling.cluster_overprovisioning.enabled ? 1 : 0 name = "cluster-overprovisioner" namespace = kubernetes_namespace.scaling[0].id repository = "https://charts.deliveryhero.io/" @@ -83,7 +83,7 @@ EOF # targets must, as mush as possible, be assigned to a new node. # - Also, don't forget about using proper values for the min and max settings. resource "helm_release" "cluster_proportional_autoscaler" { - count = var.enable_cluster_overprovisioning ? 1 : 0 + count = var.scaling.cluster_overprovisioning.enabled ? 1 : 0 name = "cluster-proportional-autoscaler" namespace = kubernetes_namespace.scaling[0].id repository = "https://kubernetes-sigs.github.io/cluster-proportional-autoscaler" @@ -121,21 +121,39 @@ EOF # resource utilization, custom metrics, and external events. #------------------------------------------------------------------------------ resource "helm_release" "keda" { - count = var.enable_keda ? 1 : 0 + count = var.keda.enabled ? 
1 : 0 name = "keda" namespace = kubernetes_namespace.keda[0].id repository = "https://kedacore.github.io/charts" chart = "keda" version = "2.15.0" - values = [] + values = [] } resource "helm_release" "keda_http_add_on" { - count = var.enable_keda && var.enable_keda_http_add_on ? 1 : 0 + count = var.keda.enabled && var.keda.http_add_on.enabled ? 1 : 0 name = "http-add-on" namespace = kubernetes_namespace.keda[0].id repository = "https://kedacore.github.io/charts" chart = "keda-add-ons-http" version = "0.8.0" - values = [] + values = [] +} + +# ------------------------------------------------------------------------------ +# Goldilocks: tune up resource requests and limits. +# ------------------------------------------------------------------------------ +resource "helm_release" "goldilocks" { + count = var.goldilocks.enabled ? 1 : 0 + name = "goldilocks" + namespace = kubernetes_namespace.monitoring_metrics[0].id + repository = "https://charts.fairwinds.com/stable" + chart = "goldilocks" + version = "3.2.1" + values = [ + templatefile("chart-values/goldilocks.yaml", { + goldilocksHost = "goldilocks.${local.platform}.${local.private_base_domain}" + }) + ] + depends_on = [helm_release.vpa] } diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/security.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/security.tf index 504d55822..d9559ebda 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/security.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/security.tf @@ -2,7 +2,7 @@ # Cert-Manager: Automatically get Let's Encrypt certificate for your ingress. #------------------------------------------------------------------------------ resource "helm_release" "certmanager" { - count = var.enable_certmanager ? 1 : 0 + count = var.certmanager.enabled ? 
1 : 0 name = "certmanager" namespace = kubernetes_namespace.certmanager[0].id repository = "https://charts.jetstack.io" @@ -19,7 +19,7 @@ resource "helm_release" "certmanager" { # Cert-Manager Cluster Issuer: Certificate issuer for Binbash domains. #------------------------------------------------------------------------------ resource "helm_release" "clusterissuer_binbash" { - count = var.enable_certmanager ? 1 : 0 + count = var.certmanager.enabled ? 1 : 0 name = "clusterissuer-binbash" namespace = kubernetes_namespace.certmanager[0].id repository = "https://binbashar.github.io/helm-charts/" @@ -39,7 +39,7 @@ resource "helm_release" "clusterissuer_binbash" { # External Secrets Operator: Automated 3rd party Service secrets injection. #------------------------------------------------------------------------------ resource "helm_release" "external_secrets" { - count = var.enable_external_secrets ? 1 : 0 + count = var.external_secrets.enabled ? 1 : 0 name = "external-secrets" namespace = kubernetes_namespace.external-secrets[0].id repository = "https://charts.external-secrets.io" @@ -53,7 +53,7 @@ resource "helm_release" "external_secrets" { } resource "helm_release" "cluster_secrets_manager" { - count = var.enable_external_secrets ? 1 : 0 + count = var.external_secrets.enabled ? 1 : 0 name = "cluster-secrets-manager" namespace = kubernetes_namespace.external-secrets[0].id @@ -85,12 +85,12 @@ resource "helm_release" "cluster_secrets_manager" { # These resources below (cluster_parameter_store) need to be commented out and applied in a second step # The reason behind this can be found in this issue: https://github.com/hashicorp/terraform-provider-kubernetes/issues/1367#issuecomment-1239205722 -# and the surounding discussion. +# and the surrounding discussion. 
# TODO: Move onto using a raw YAML helm chart as in https://github.com/itscontained/charts/tree/master/itscontained/raw # resource "kubernetes_manifest" "cluster_parameter_store" { -# count = var.enable_external_secrets ? 1 : 0 +# count = var.external_secrets.enabled ? 1 : 0 # manifest = { # "apiVersion" = "external-secrets.io/v1beta1" diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/terraform.tfvars b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/terraform.tfvars index fdeaeb725..af23bfcae 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/terraform.tfvars +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/terraform.tfvars @@ -1,40 +1,93 @@ #------------------------------------------------------------------------------ # Ingress #------------------------------------------------------------------------------ -enable_alb_ingress_controller = false -enable_nginx_ingress_controller = true -apps_ingress = { - enabled = false - # Load balancer type: internet-facing or internal - type = "internal" +ingress = { + alb_controller = { + enabled = false + } + + nginx_controller = { + enabled = true + } + + apps_ingress = { + enabled = false + # Load balancer type: internet-facing or internal + type = "internal" + + logging = { + enabled = false + prefix = "" + } + } } #------------------------------------------------------------------------------ # Certificate Manager #------------------------------------------------------------------------------ -enable_certmanager = true +certmanager = { + enabled = true +} #------------------------------------------------------------------------------ # External DNS sync #------------------------------------------------------------------------------ -enable_private_dns_sync = true -enable_public_dns_sync = false +dns_sync = { + private = { + enabled = true + } + + public = { + enabled = false + } +} #------------------------------------------------------------------------------ # Secrets 
Management #------------------------------------------------------------------------------ -enable_external_secrets = true +external_secrets = { + enabled = true +} #------------------------------------------------------------------------------ # Scaling #------------------------------------------------------------------------------ -enable_hpa_scaling = false -enable_vpa_scaling = false -enable_cluster_autoscaling = true -enable_cluster_overprovisioning = false -enable_keda = false -enable_keda_http_add_on = false +scaling = { + hpa = { + enabled = false + } + + vpa = { + enabled = false + } + + cluster_autoscaling = { + enabled = true + } + + cluster_overprovisioning = { + enabled = false + } +} +#------------------------------------------------------------------------------ +# Scaling: Goldilocks +#------------------------------------------------------------------------------ +goldilocks = { + enabled = true +} + + +#------------------------------------------------------------------------------ +# Scaling: Keda +#------------------------------------------------------------------------------ +keda = { + enabled = false + + http_add_on = { + enabled = false + } +} #------------------------------------------------------------------------------ # Monitoring: Logging #------------------------------------------------------------------------------ @@ -50,34 +103,78 @@ logging = { } #------------------------------------------------------------------------------ -# Monitoring: Metrics +# Monitoring: Prometheus #------------------------------------------------------------------------------ # KubePrometheusStack -kube_prometheus_stack = { - enabled = false +prometheus = { + kube_stack = { + enabled = true + + alertmanager = { + enabled = false + } + } + + external = { + dependencies = { + enabled = false + } + grafana_dependencies = { + enabled = false + } + } } -# (External) Prometheus dependencies -enable_prometheus_dependencies = false -enable_grafana_dependencies = 
false #------------------------------------------------------------------------------ # Monitoring: Datadog (logs, metrics, and more) #------------------------------------------------------------------------------ -enable_datadog_agent = false +datadog_agent = { + enabled = false +} #------------------------------------------------------------------------------ # Monitoring: Alerts #------------------------------------------------------------------------------ # KWatch -enable_kwatch = false +kwatch = { + enabled = false +} + +#------------------------------------------------------------------------------ +# Monitoring: Uptime Kuma +#------------------------------------------------------------------------------ +uptime_kuma = { + enabled = false +} #------------------------------------------------------------------------------ -# CICD | ArgoCD +# Monitoring: Gatus #------------------------------------------------------------------------------ -enable_cicd = true -enable_argocd_image_updater = true -enable_argo_rollouts = false +gatus = { + enabled = true +} + +#------------------------------------------------------------------------------ +# CICD | Argo +#------------------------------------------------------------------------------ +argocd = { + enabled = true + + enableWebTerminal = true + enableNotifications = false + + image_updater = { + enabled = true + } + rollouts = { + enabled = true + + dashboard = { + enabled = false + } + } +} #------------------------------------------------------------------------------ # FinOps | Cost Optimizations Tools @@ -86,8 +183,3 @@ cost_optimization = { kube_resource_report = false cost_analyzer = false } - -#------------------------------------------------------------------------------ -# Uptime Kuma -#------------------------------------------------------------------------------ -enable_uptime_kuma = false diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/variables.tf 
b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/variables.tf index a44863083..8c3056973 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/variables.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/k8s-components/variables.tf @@ -1,206 +1,213 @@ #=============================# # Layer Flags # #=============================# -variable "enable_nginx_ingress_controller" { - type = bool - default = false -} - -variable "enable_alb_ingress_controller" { - type = bool - default = false -} - -variable "apps_ingress" { - type = any - default = {} -} +variable "ingress" { + type = object({ + alb_controller = map(any) + nginx_controller = map(any) + apps_ingress = object({ + enabled = bool + type = string + logging = object({ + enabled = bool + prefix = string + }) + }) + }) + default = { + alb_controller = { + enabled = true + } -variable "enable_private_dns_sync" { - type = bool - default = false -} + nginx_controller = { + enabled = true + } -variable "enable_public_dns_sync" { - type = bool - default = false -} + apps_ingress = { + enabled = false -variable "enable_certmanager" { - type = bool - default = false -} + type = "internal" -variable "enable_vault" { - type = bool - default = false + logging = { + enabled = false + prefix = "" + } + } + } } -variable "enable_external_secrets" { - type = bool - default = false +variable "certmanager" { + type = map(any) + default = { + enabled = true + } } -variable "enable_cicd" { - type = bool - default = false -} +variable "dns_sync" { + type = map(any) + default = { + private = { + enabled = true + } -variable "enable_argocd_image_updater" { - type = bool - default = false + public = { + enabled = false + } + } } -variable "enable_argo_rollouts" { - type = bool - default = false +variable "external_secrets" { + type = map(any) + default = { + enabled = true + } } -variable "enable_hpa_scaling" { - type = bool - default = false -} +variable "scaling" { + type = map(any) + default = { + hpa = { + enabled 
= false
+  }

-variable "enable_vpa_scaling" {
-  type    = bool
-  default = false
-}
+  vpa = {
+    enabled = false
+  }

-variable "enable_cluster_autoscaling" {
-  type    = bool
-  default = false
-}
+  cluster_autoscaling = {
+    enabled = false
+  }

-variable "enable_cluster_overprovisioning" {
-  type    = bool
-  default = false
+  cluster_overprovisioning = {
+    enabled = false
+  }
+  }
 }

-variable "enable_gatus" {
-  type    = bool
-  default = false
+variable "goldilocks" {
+  type = map(any)
+  default = {
+    enabled = false
+  }
 }

 variable "logging" {
-  type    = any
-  default = {}
+  type = object({
+    enabled    = bool,
+    forwarders = list(string)
+  })
+  default = {
+    enabled = false
+
+    forwarders = []
+  }
 }

-variable "enable_ingressmonitorcontroller" {
-  type    = bool
-  default = false
+variable "prometheus" {
+  type = object({
+    kube_stack = object({
+      enabled      = bool,
+      alertmanager = map(any)
+    })
+    external = map(any)
+  })
+  default = {
+    kube_stack = {
+      enabled = true
+
+      alertmanager = {
+        enabled = false
+      }
+    }
+
+    external = {
+      dependencies = {
+        enabled = false
+      }
+
+      grafana_dependencies = {
+        enabled = false
+      }
+    }
+  }
 }

-variable "kube_prometheus_stack" {
+variable "datadog_agent" {
   type = map(any)
   default = {
     enabled = false
   }
 }

-variable "enable_prometheus_dependencies" {
-  type    = bool
-  default = false
-}
-
-variable "enable_grafana_dependencies" {
-  type    = bool
-  default = false
-}
-
-variable "enable_kubernetes_dashboard" {
-  type    = bool
-  default = false
-}
-
-variable "kubernetes_dashboard_ingress_class" {
-  type    = string
-  default = "private-apps"
+variable "kwatch" {
+  type = map(any)
+  default = {
+    enabled = false
+  }
 }

-variable "kubernetes_dashboard_hosts" {
-  type    = string
-  default = "kubernetes-dashboard.devstg.aws.binbash.com.ar"
+variable "uptime_kuma" {
+  type = map(any)
+  default = {
+    enabled = false
+  }
 }

-variable "enable_backups" {
-  type    = bool
-  default = false
+variable "gatus" {
+  type = map(any)
+  default = {
+    enabled = false
+  }
 }

-variable 
"enable_eks_alb_logging" { - description = "Turn EKS ALB logging on" - type = bool - default = false -} +variable "argocd" { + type = object({ + enabled = bool + enableWebTerminal = bool + enableNotifications = bool + image_updater = map(any) + rollouts = object({ + enabled = bool + dashboard = map(any) + }) + }) + default = { + enabled = true -variable "eks_alb_logging_prefix" { - description = "Turn EKS ALB logging on" - type = string - default = "" -} + enableWebTerminal = true + enableNotifications = false -#==================================# -# Ingress Monitor Controller (IMC) # -#==================================# -variable "imc" { - type = any - default = {} -} + image_updater = { + enabled = false + } -#==================================# -# Backups # -#==================================# -variable "schedules" { - type = any - default = {} -} + rollouts = { + enabled = false -#==================================# -# DataDog Agent # -#==================================# -variable "enable_datadog_agent" { - type = bool - default = false + dashboard = { + enabled = false + } + } + } } variable "cost_optimization" { - type = any - default = {} -} - -#==================================# -# Uptime Kuma # -#==================================# -variable "enable_uptime_kuma" { - type = bool - default = false -} - -#==================================# -# KWatch # -#==================================# -variable "enable_kwatch" { - type = bool - default = false -} - -#==================================# -# enable_prometheus_stack -#==================================# -variable "enable_prometheus_stack" { - type = bool - default = false + type = map(any) + default = { + kube_resource_report = false + cost_analyzer = false + } } -#==================================# -# enable_keda and keda http add on -#==================================# -variable "enable_keda" { - type = bool - default = false -} -variable "enable_keda_http_add_on" { - type = bool - default = false 
+variable "keda" { + type = object({ + enabled = bool + http_add_on = map(any) + }) + default = { + enabled = false + http_add_on = { + enabled = false + } + } } diff --git a/apps-devstg/us-east-1/k8s-eks-demoapps/network/config.tf b/apps-devstg/us-east-1/k8s-eks-demoapps/network/config.tf index 9efcf62ce..a5c7df304 100644 --- a/apps-devstg/us-east-1/k8s-eks-demoapps/network/config.tf +++ b/apps-devstg/us-east-1/k8s-eks-demoapps/network/config.tf @@ -16,7 +16,7 @@ provider "aws" { # Backend Config (partial) # terraform { - required_version = "~> 1.2" + required_version = "~> 1.6" required_providers { aws = "~> 4.11" diff --git a/apps-devstg/us-east-1/k8s-eks/k8s-components/chart-values/kube-prometheus-stack.yaml b/apps-devstg/us-east-1/k8s-eks/k8s-components/chart-values/kube-prometheus-stack.yaml index fb827f24c..28f5bbd0e 100644 --- a/apps-devstg/us-east-1/k8s-eks/k8s-components/chart-values/kube-prometheus-stack.yaml +++ b/apps-devstg/us-east-1/k8s-eks/k8s-components/chart-values/kube-prometheus-stack.yaml @@ -61,7 +61,7 @@ alertmanager: annotations: kubernetes.io/tls-acme: 'true' kubernetes.io/ingress.class: ${ingressClass} - cert-manager.io/cluster-issuer: clusterissuer-arta-cert-manager-clusterissuer + cert-manager.io/cluster-issuer: clusterissuer-binbash-cert-manager-clusterissuer hosts: - ${alertmanagerHost} path: / @@ -99,7 +99,7 @@ grafana: annotations: kubernetes.io/tls-acme: 'true' kubernetes.io/ingress.class: ${ingressClass} - cert-manager.io/cluster-issuer: clusterissuer-arta-cert-manager-clusterissuer + cert-manager.io/cluster-issuer: clusterissuer-binbash-cert-manager-clusterissuer hosts: - ${grafanaHost} path: / @@ -141,7 +141,7 @@ prometheus: annotations: kubernetes.io/tls-acme: 'true' kubernetes.io/ingress.class: ${ingressClass} - cert-manager.io/cluster-issuer: clusterissuer-arta-cert-manager-clusterissuer + cert-manager.io/cluster-issuer: clusterissuer-binbash-cert-manager-clusterissuer hosts: - ${prometheusHost} path: / diff --git a/build.env 
b/build.env index c44f462e0..1f02dad52 100644 --- a/build.env +++ b/build.env @@ -5,4 +5,4 @@ PROJECT=bb MFA_ENABLED=false # Terraform -TERRAFORM_IMAGE_TAG=1.3.5-0.2.1 +TERRAFORM_IMAGE_TAG=1.6.0-0.2.1 diff --git a/shared/us-east-1/secrets-manager/secrets.tf b/shared/us-east-1/secrets-manager/secrets.tf index 57f88ed24..2fd77c9a8 100644 --- a/shared/us-east-1/secrets-manager/secrets.tf +++ b/shared/us-east-1/secrets-manager/secrets.tf @@ -20,19 +20,6 @@ module "secrets" { secret_string = "INITIAL_VALUE" kms_key_id = data.terraform_remote_state.keys.outputs.aws_kms_key_id }, - # "/notifications/alertmanager" = { - # description = "Slack webhook for Alertmanager notifications" - # recovery_window_in_days = 7 - # secret_string = "INITIAL_VALUE" - # kms_key_id = data.terraform_remote_state.keys.outputs.aws_kms_key_id - # }, - # "/grafana/administrator" = { - # description = "Credentials for Grafana administrator user" - # recovery_window_in_days = 7 - # secret_string = "INITIAL_VALUE" - # kms_key_id = data.terraform_remote_state.keys.outputs.aws_kms_key_id - # }, - # # This secret was created based on the centralized secrets approach and the naming conventions # defined here: https://binbash.atlassian.net/wiki/spaces/BDPS/pages/2425978910/Secrets+Management+Conventions @@ -43,6 +30,18 @@ module "secrets" { secret_string = "INITIAL_VALUE" kms_key_id = data.terraform_remote_state.keys.outputs.aws_kms_key_id }, + "/devops/monitoring/alertmanager" = { + description = "Slack webhook for Alertmanager notifications" + recovery_window_in_days = 7 + secret_string = "INITIAL_VALUE" + kms_key_id = data.terraform_remote_state.keys.outputs.aws_kms_key_id + }, + "/devops/monitoring/grafana/administrator" = { + description = "Credentials for Grafana administrator user" + recovery_window_in_days = 7 + secret_string = "INITIAL_VALUE" + kms_key_id = data.terraform_remote_state.keys.outputs.aws_kms_key_id + }, } tags = local.tags