Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: metrics via terraform #10594

Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 143 additions & 0 deletions .github/workflows/metrics-deploy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
name: Aztec Metrics Stack Deployment

# Two entry points expose the same five inputs with the same defaults:
# `workflow_call` (reusable, invoked from another workflow) and
# `workflow_dispatch` (manual run). Keep the two input lists in sync.
on:
  workflow_call:
    inputs:
      namespace:
        description: The namespace to deploy to, e.g. metrics
        required: true
        type: string
        default: metrics
      values_file:
        # Review feedback on this input: "It's a nit but presumably this
        # example should be one of the metrics values files?" The example
        # names prod.yaml, matching the default below and the
        # workflow_dispatch description.
        description: The values file to use, e.g. prod.yaml
        required: true
        type: string
        default: "prod.yaml"
      respect_tf_lock:
        # Kept as a string: the value is forwarded verbatim to Terraform's
        # `-lock=` flag by the destroy/plan/apply steps.
        description: Whether to respect the Terraform lock
        required: false
        type: string
        default: "true"
      run_terraform_destroy:
        # Kept as a string: the destroy step's `if:` compares it to 'true'.
        description: Whether to run terraform destroy before deploying
        required: false
        type: string
        default: "false"
      ref:
        # Used as the `ref` of the checkout step.
        description: The branch name to deploy from
        required: false
        type: string
        default: "master"
    secrets:
      # Passed as `credentials_json` to the Google Cloud auth step.
      GCP_SA_KEY:
        required: true
  workflow_dispatch:
    inputs:
      namespace:
        description: The namespace to deploy to, e.g. metrics
        required: true
        default: metrics
      values_file:
        description: The values file to use, e.g. prod.yaml
        required: true
        default: "prod.yaml"
      respect_tf_lock:
        description: Whether to respect the Terraform lock
        required: false
        default: "true"
      run_terraform_destroy:
        description: Whether to run terraform destroy before deploying
        required: false
        default: "false"
      ref:
        description: The branch name to deploy from
        required: false
        default: "master"

jobs:
  # Authenticates to Google Cloud, points kubectl at the GKE cluster, then runs
  # Terraform from spartan/terraform/deploy-metrics (optional destroy, plan,
  # apply).
  metrics_deployment:
    # This job will run on Ubuntu
    runs-on: ubuntu-latest
    concurrency:
      group: deploy-${{ github.ref }} # Only one job per branch
      cancel-in-progress: false # Allow previous deployment to complete to avoid corruption

    # Job-wide settings. Shell steps read these as ordinary environment
    # variables ("${NAME}") instead of `${{ ... }}` expressions, so
    # caller-supplied input text is never pasted into the script source.
    env:
      NAMESPACE: ${{ inputs.namespace }}
      VALUES_FILE: ${{ inputs.values_file }}
      RESPECT_TF_LOCK: ${{ inputs.respect_tf_lock }}
      CHART_PATH: ./spartan/metrics
      CLUSTER_NAME: aztec-gke
      # NOTE(review): zone-style value that is handed to gcloud's --region
      # flag below -- confirm this is intended.
      REGION: us-west1-a
      TF_STATE_BUCKET: aztec-terraform
      GKE_CLUSTER_CONTEXT: gke_testnet-440309_us-west1-a_aztec-gke

    steps:
      - name: Checkout code
        uses: actions/checkout@v3
        with:
          ref: ${{ inputs.ref }}

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GCP_SA_KEY }}

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v2

      - name: Install GKE Auth Plugin
        run: |
          gcloud components install gke-gcloud-auth-plugin --quiet

      - name: Configure kubectl with GKE cluster
        run: |
          gcloud container clusters get-credentials "${CLUSTER_NAME}" --region "${REGION}"

      # Creates the state bucket (with object versioning) only when listing it
      # fails; otherwise leaves the existing bucket untouched.
      - name: Ensure Terraform state bucket exists
        run: |
          if ! gsutil ls "gs://${TF_STATE_BUCKET}" >/dev/null 2>&1; then
            echo "Creating GCS bucket for Terraform state..."
            gsutil mb -l us-east4 "gs://${TF_STATE_BUCKET}"
            gsutil versioning set on "gs://${TF_STATE_BUCKET}"
          else
            echo "Terraform state bucket already exists"
          fi

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: "1.5.0" # Specify your desired version

      # The state prefix is built from region, cluster and namespace, so each
      # namespace gets its own state path.
      - name: Terraform Init
        working-directory: ./spartan/terraform/deploy-metrics
        run: |
          terraform init \
            -backend-config="bucket=${TF_STATE_BUCKET}" \
            -backend-config="prefix=metrics-deploy/${REGION}/${CLUSTER_NAME}/${NAMESPACE}/terraform.tfstate"

      - name: Terraform Destroy
        working-directory: ./spartan/terraform/deploy-metrics
        if: ${{ inputs.run_terraform_destroy == 'true' }}
        # Destroy fails if the resources are already destroyed, so we continue on error
        continue-on-error: true
        run: |
          terraform destroy -auto-approve \
            -var="RELEASE_NAME=${NAMESPACE}" \
            -var="VALUES_FILE=${VALUES_FILE}" \
            -var="GKE_CLUSTER_CONTEXT=${GKE_CLUSTER_CONTEXT}" \
            -lock="${RESPECT_TF_LOCK}"

      # Writes the plan to `tfplan`, which the apply step consumes.
      - name: Terraform Plan
        working-directory: ./spartan/terraform/deploy-metrics
        run: |
          terraform plan \
            -var="RELEASE_NAME=${NAMESPACE}" \
            -var="VALUES_FILE=${VALUES_FILE}" \
            -var="GKE_CLUSTER_CONTEXT=${GKE_CLUSTER_CONTEXT}" \
            -out=tfplan \
            -lock="${RESPECT_TF_LOCK}"

      - name: Terraform Apply
        working-directory: ./spartan/terraform/deploy-metrics
        run: terraform apply -lock="${RESPECT_TF_LOCK}" -auto-approve tfplan
16 changes: 1 addition & 15 deletions spartan/metrics/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,6 @@ opentelemetry-collector:
kubernetesAttributes:
enabled: true
config:
exporters:
# debug:
# verbosity: detailed
otlphttp/logs:
endpoint: http://metrics-loki.metrics:3100/otlp
otlp/tempo:
endpoint: http://metrics-tempo.metrics:4317
tls:
insecure: true
prometheus:
endpoint: ${env:MY_POD_IP}:8889
metric_expiration: 5m
resource_to_telemetry_conversion:
enabled: true
extensions:
health_check:
endpoint: ${env:MY_POD_IP}:13133
Expand Down Expand Up @@ -91,7 +77,7 @@ opentelemetry-collector:
# - debug

# Enable and configure the Loki subchart
# https://artifacthub.io/packages/helm/grafana/loki-simple-scalable
# https://artifacthub.io/packages/helm/grafana/loki
# loki:
# Nothing set here, because we need to use values from the values directory;
# otherwise, things don't get overridden correctly.
Expand Down
34 changes: 34 additions & 0 deletions spartan/metrics/values/kind.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,20 @@
# Collector exporters: one each for logs (Loki OTLP endpoint), traces (Tempo)
# and a Prometheus endpoint bound to the pod IP.
opentelemetry-collector:
  config:
    exporters:
      # debug:
      #   verbosity: detailed
      otlphttp/logs:
        endpoint: http://metrics-loki.metrics:3100/otlp
      otlp/tempo:
        endpoint: http://metrics-tempo.metrics:4317
        tls:
          insecure: true
      prometheus:
        endpoint: ${env:MY_POD_IP}:8889
        metric_expiration: 5m
        resource_to_telemetry_conversion:
          enabled: true

loki:
deploymentMode: SingleBinary
loki:
Expand All @@ -23,3 +40,20 @@ loki:
replicas: 0
write:
replicas: 0

# Grafana datasources: Loki, Tempo and Prometheus, with Prometheus as the
# default datasource.
grafana:
  datasources:
    datasources.yaml:
      apiVersion: 1
      datasources:
        - name: Loki
          type: loki
          url: http://metrics-loki.metrics:3100
        - name: Tempo
          type: tempo
          url: http://metrics-tempo.metrics:3100
        - name: Prometheus
          type: prometheus
          uid: spartan-metrics-prometheus
          isDefault: true
          url: http://metrics-prometheus-server.metrics:80
51 changes: 38 additions & 13 deletions spartan/metrics/values/prod.yaml
Original file line number Diff line number Diff line change
@@ -1,20 +1,25 @@
# Enable and configure Grafana
# https://artifacthub.io/packages/helm/grafana/grafana
grafana:
service:
type: LoadBalancer
persistence:
type: pvc
enabled: true
size: "10Gi"

# Collector settings: jaeger-compact port disabled, service exposed as a
# LoadBalancer, and exporters for logs (loki-write), traces (Tempo) and a
# Prometheus endpoint bound to the pod IP.
opentelemetry-collector:
  ports:
    jaeger-compact:
      enabled: false
  service:
    enabled: true
    type: LoadBalancer
  config:
    exporters:
      # debug:
      #   verbosity: detailed
      otlphttp/logs:
        endpoint: http://loki-write.metrics:3100/otlp
      otlp/tempo:
        endpoint: http://metrics-tempo.metrics:4317
        tls:
          insecure: true
      prometheus:
        endpoint: ${env:MY_POD_IP}:8889
        metric_expiration: 5m
        resource_to_telemetry_conversion:
          enabled: true

loki:
loki:
Expand Down Expand Up @@ -58,6 +63,26 @@ loki:
persistence:
size: 64Gi

gateway:
service:
type: LoadBalancer
# https://artifacthub.io/packages/helm/grafana/grafana
# Grafana: LoadBalancer service, 10Gi PVC-backed persistence, and datasources
# for Loki (loki-read), Tempo and Prometheus (default).
grafana:
  service:
    type: LoadBalancer
  persistence:
    type: pvc
    enabled: true
    size: "10Gi"
  datasources:
    datasources.yaml:
      apiVersion: 1
      datasources:
        - name: Loki
          type: loki
          url: http://loki-read.metrics:3100
        - name: Tempo
          type: tempo
          url: http://metrics-tempo.metrics:3100
        - name: Prometheus
          type: prometheus
          uid: spartan-metrics-prometheus
          isDefault: true
          url: http://metrics-prometheus-server.metrics:80
1 change: 1 addition & 0 deletions spartan/terraform/deploy-metrics/data.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

53 changes: 53 additions & 0 deletions spartan/terraform/deploy-metrics/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Terraform settings: pinned Helm/Kubernetes providers and GCS remote state.
terraform {
  required_providers {
    kubernetes = {
      version = "~> 2.24.0"
      source  = "hashicorp/kubernetes"
    }
    helm = {
      version = "~> 2.16.1"
      source  = "hashicorp/helm"
    }
  }

  # Values written here are the in-file backend settings; the metrics-deploy
  # workflow passes its own bucket and prefix via `-backend-config` at init.
  backend "gcs" {
    prefix = "terraform/state"
    bucket = "aztec-terraform"
  }
}

# Kubernetes provider (alias "gke-cluster"): reads the local kubeconfig and
# selects the context named by var.GKE_CLUSTER_CONTEXT.
provider "kubernetes" {
  alias          = "gke-cluster"
  config_context = var.GKE_CLUSTER_CONTEXT
  config_path    = "~/.kube/config"
}

# Helm provider (alias "gke-cluster"): same kubeconfig file and context as the
# aliased Kubernetes provider.
provider "helm" {
  alias = "gke-cluster"

  kubernetes {
    config_context = var.GKE_CLUSTER_CONTEXT
    config_path    = "~/.kube/config"
  }
}

# Aztec Helm release for gke-cluster
# Installs the local "metrics" chart through the aliased Helm provider; the
# release and its namespace are both named var.RELEASE_NAME.
resource "helm_release" "aztec-gke-cluster" {
  provider = helm.gke-cluster

  # Release identity
  name             = var.RELEASE_NAME
  namespace        = var.RELEASE_NAME
  create_namespace = true

  # Chart location, relative to this Terraform directory
  chart             = "metrics"
  repository        = "../../"
  dependency_update = true

  # Values file chosen by the caller from ../../metrics/values/
  values = [file("../../metrics/values/${var.VALUES_FILE}")]

  # Install/upgrade behaviour
  upgrade_install = true
  force_update    = true

  # Wait for the release (including jobs), for at most 1200 s = 20 minutes
  wait          = true
  wait_for_jobs = true
  timeout       = 1200
}
1 change: 1 addition & 0 deletions spartan/terraform/deploy-metrics/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

15 changes: 15 additions & 0 deletions spartan/terraform/deploy-metrics/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# kubeconfig context that the aliased Kubernetes and Helm providers select.
# The default mirrors the GKE_CLUSTER_CONTEXT value the metrics-deploy
# workflow passes, so a run without an explicit -var targets the same cluster.
variable "GKE_CLUSTER_CONTEXT" {
  description = "GKE cluster context"
  type        = string
  default     = "gke_testnet-440309_us-west1-a_aztec-gke"
}

# Required (no default). Used for both the Helm release name and the target
# namespace of the helm_release resource.
variable "RELEASE_NAME" {
  type        = string
  description = "Name of helm deployment and k8s namespace"
}

# Required (no default). File name only; the helm_release resource reads it
# from ../../metrics/values/.
variable "VALUES_FILE" {
  type        = string
  description = "Name of the values file to use for deployment"
}
Loading