# Configuration of dependency charts
# -------------------------------------------------------------------------------
#
# cluster-autoscaler is responsible for understanding if k8s nodes need to be
# added or removed. A pending pod is a sign that another node should be added,
# and an underused node is a sign that a node should be removed.
#
# We always need a cluster-autoscaler, but only deploy it ourselves if it's not
# provided as part of the k8s cluster.
#
# values ref: https://github.com/kubernetes/autoscaler/blob/master/charts/cluster-autoscaler/values.yaml
#
cluster-autoscaler:
  enabled: false
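
# A hedged way to check whether a cluster already ships its own autoscaler
# before enabling this one is to look for an autoscaler pod in kube-system;
# note this only catches autoscalers deployed inside the cluster - managed
# offerings (e.g. GKE) run theirs on the control plane instead:
#
#   kubectl get pods -n kube-system | grep -i autoscaler
#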

# ingress-nginx is responsible for proxying traffic based on k8s Ingress
# resources defined in the k8s cluster.
#
# Typically, all inbound traffic arrives at an ingress-nginx controller pod
# via a k8s Service that has a public IP, and is thereafter proxied to a k8s
# Service (a Pod selected by a k8s Service) that doesn't have a public IP.
# A commented example Ingress resource is sketched after this block.
#
# values ref: https://github.com/kubernetes/ingress-nginx/blob/main/charts/ingress-nginx/values.yaml
#
ingress-nginx:
  controller:
    podLabels:
      # ingress-nginx controllers need to be allowed to proxy traffic onwards
      # to JupyterHub's proxy pod (and only the proxy pod) in clusters with
      # NetworkPolicy enforcement enabled. Adding this label on the controller
      # pod allows that.
      #
      # ref: https://z2jh.jupyter.org/en/stable/administrator/security.html#introduction-to-the-chart-s-four-network-policies
      #
      hub.jupyter.org/network-access-proxy-http: "true"
    resources:
      requests:
        cpu: 100m # chart's default is 100m
        # This pod has been observed at 104-131Mi so far in 2i2c clusters, above
        # the default request of 90Mi. To protect it from being evicted for
        # using more memory than requested, we have increased the memory request.
        memory: 250Mi # chart's default is 90Mi
    # allowSnippetAnnotations was set to false by default in the chart as a
    # result of the CVE linked below. We should investigate if there are other
    # ways we can accomplish this type of config.
    # https://github.com/kubernetes/ingress-nginx/issues/7837
    allowSnippetAnnotations: true
    admissionWebhooks:
      # Disabled as cert-manager in EKS 1.30 ran into issues working with
      # Ingress objects because calls to this webhook failed, with an error
      # message including this:
      #
      # err="Internal error occurred: failed calling webhook \"validate.nginx.ingress.kubernetes.io\": failed to call webhook: Post \"https://support-ingress-nginx-controller-admission.support.svc:443/networking/v1/ingresses?timeout=10s\": tls: failed to verify certificate: x509: certificate signed by unknown authority"
      #
      # Disabling this shouldn't be an issue, it just helps catch invalid
      # configurations beyond what the k8s api-server will catch.
      #
      enabled: false
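
# As an illustrative (hypothetical) example of what this controller acts on: an
# Ingress resource like the following, created in a hub's namespace, would make
# ingress-nginx route traffic for the given host to the hub's proxy Service.
# Names and hosts here are made up; the real Ingress objects are created by the
# JupyterHub and other charts, not by this file.
#
#   apiVersion: networking.k8s.io/v1
#   kind: Ingress
#   metadata:
#     name: example-hub
#   spec:
#     ingressClassName: nginx
#     rules:
#       - host: hub.example.2i2c.cloud
#         http:
#           paths:
#             - path: /
#               pathType: Prefix
#               backend:
#                 service:
#                   name: proxy-public
#                   port:
#                     number: 80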

# prometheus is responsible for collecting metrics and providing them to
# grafana on request. It comes with several dependency charts, of which we
# opt out of a few.
#
# values ref: https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus/values.yaml
#
prometheus:
  # alertmanager is an optional prometheus chart dependency that we opt out of
  # as we favor Grafana for this functionality. Grafana provides alerts and does
  # so with a better UI that we expose publicly behind auth anyhow.
  #
  alertmanager:
    enabled: false
  # prometheus-pushgateway is an optional prometheus chart dependency that we
  # opt out of. pushgateway complements prometheus server's behavior of
  # scraping metrics from services by allowing services to push metrics to
  # prometheus instead.
  #
  prometheus-pushgateway:
    enabled: false
  # kube-state-metrics is an optional prometheus chart dependency that we rely
  # on to collect metrics about the k8s cluster's pods and nodes as reported by
  # the k8s api-server.
  #
  # values ref: https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-state-metrics/values.yaml
  #
  kube-state-metrics:
    # kube-state-metrics stopped collecting *all* labels on all kubernetes
    # objects as prometheus labels, as it exploded the total number of metrics
    # being collected. We have to explicitly allow-list the labels we want.
    # These are the ones needed for the graphs from
    # https://github.com/jupyterhub/grafana-dashboards to work.
    metricLabelsAllowlist:
      - pods=[app,component,hub.jupyter.org/username,app.kubernetes.io/component]
      - nodes=[*]
      - services=[app, component]
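    # The allow-listed labels above surface on kube-state-metrics' kube_*_labels
    # series with a label_ prefix and sanitized characters. As a hedged sketch
    # (per kube-state-metrics' naming conventions), a PromQL query selecting
    # user server pods by their JupyterHub username label could look like:
    #
    #   kube_pod_labels{label_hub_jupyter_org_username!=""}
    #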
  # prometheus-node-exporter is an optional prometheus chart dependency that we
  # rely on to collect metrics about the nodes.
  #
  # values ref: https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-node-exporter/values.yaml
  #
  prometheus-node-exporter:
    # resources for the node-exporter were set after inspecting cpu and memory
    # use via prometheus and grafana.
    #
    # node-exporter is typically found using between 0-3m CPU and 2-22Mi memory,
    # but we've seen it fail to report cpu/memory use metrics from time to time
    # when requesting and limiting to 5m, so we've increased the request/limit
    # to 10m.
    #
    # PromQL queries for CPU and memory use:
    # - CPU: sum(rate(container_cpu_usage_seconds_total{container="node-exporter", namespace="support"}[5m])) by (pod)
    # - Memory: sum(container_memory_usage_bytes{container="node-exporter", namespace="support"}) by (pod)
    #
    resources:
      limits:
        cpu: 10m
        memory: 30Mi
      requests:
        cpu: 10m
        memory: 30Mi
  networkPolicy:
    enabled: true
  server:
    # Prometheus sometimes seems to run into bugs where the 'write ahead log'
    # used for recovery from crashes and network failures gets massive, and
    # requires an order of magnitude more RAM to start up than usual. This could
    # have been greatly improved in prometheus 2.45, which is used as of
    # 2023-06-28, but we stick with a workaround for now until other users have
    # concluded things have greatly improved for them in prometheus 2.45+.
    #
    # ref: https://github.com/prometheus/prometheus/issues/6934#issuecomment-1610921555
    #
    # We are deleting the WAL at each startup now, and it has made us lose some
    # data during restarts, but that is preferable to having the entire
    # prometheus server be down.
    #
    # We can't use initContainers here, as they are not run when a container
    # restarts - only when the pod itself is recreated. And unfortunately there
    # isn't a simple script we can run that will pass through the args to
    # /bin/prometheus after running the rm. So we improvise here - this is
    # the command line of what prometheus is actually executing, copied over.
    # This must be revalidated with each bump of the prometheus helm chart!
    #
    # IMPORTANT: When upgrading the prometheus chart we need to manually adjust
    # for changes to...
    #
    # - The Deployment resource template's container args:
    #   https://github.com/prometheus-community/helm-charts/blob/41a8b07fb3477be61a58ad9acd8879625eabf0d0/charts/prometheus/templates/deploy.yaml#L129-L163
    #
    # - The default values of the config referenced in the template:
    #   https://github.com/prometheus-community/helm-charts/blob/41a8b07fb3477be61a58ad9acd8879625eabf0d0/charts/prometheus/values.yaml
    #
    # In the commit 41a8b07f linked above, this config was
    # referenced and defaulted to:
    #
    # server.retention                  - "15d"
    # server.retentionSize              - ""
    # server.configPath                 - "/etc/config/prometheus.yml"
    # server.storagePath                - ""
    # server.persistentVolume.mountPath - "/data"
    # server.extraFlags                 - ["web.enable-lifecycle"]
    # server.extraArgs                  - {}
    # server.prefixURL                  - ""
    # server.baseURL                    - ""
    #
    # Currently our adjustments from the defaults are to
    # server.retention and server.extraFlags, configured
    # directly below server.defaultFlagsOverride.
    #
    command:
      - /bin/sh
    defaultFlagsOverride:
      - -c
      # If changing this, please also find wherever this has been copied to and
      # change those as well, as appropriate.
      - "rm -rvf /data/wal/* && \
        exec /bin/prometheus \
        --storage.tsdb.retention.time=1098d \
        --config.file=/etc/config/prometheus.yml \
        --storage.tsdb.path=/data \
        --web.console.libraries=/etc/prometheus/console_libraries \
        --web.console.templates=/etc/prometheus/consoles \
        --web.enable-lifecycle"
    # extraFlags MUST BE UPDATED in prometheus.server.defaultFlagsOverride as well
    extraFlags:
      - web.enable-lifecycle
    # retention MUST BE UPDATED in prometheus.server.defaultFlagsOverride as well
    retention: 1098d # Keep data for at least 3 years
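    # When revalidating the copied command line above after a chart upgrade, a
    # hedged way to compare it against what is actually running (resource names
    # assumed to match this chart's "support" release in the "support"
    # namespace) is:
    #
    #   kubectl get deploy support-prometheus-server -n support \
    #     -o jsonpath='{.spec.template.spec.containers[*].args}'
    #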
    ingress:
      ingressClassName: nginx
      annotations:
        # Annotations required to enable basic authentication for any ingress
        # into prometheus server from the outside world. The secret is created
        # via the templates/prometheus-ingress-auth/secret.yaml file in the
        # support chart, and its contents are controlled by config under
        # prometheusIngressAuthSecret. Ingress is not enabled by default, so in
        # whichever clusters we want this, we should enable it explicitly.
        nginx.ingress.kubernetes.io/auth-type: basic
        nginx.ingress.kubernetes.io/auth-secret: prometheus-ingress-auth-basic
        nginx.ingress.kubernetes.io/auth-realm: "Authentication Required"
        # If we enable external ingress into prometheus, we must secure it with HTTPS
        cert-manager.io/cluster-issuer: letsencrypt-prod
        # Increase the timeout for each query made by the grafana frontend to
        # the grafana backend to 2min, which is the default timeout for
        # prometheus queries. This also matches the timeout for the dataproxy in
        # grafana, set under `grafana.grafana.ini` below. These two timeouts are
        # set together to give prometheus the best chance of executing the
        # queries we care about. While a grafana in the same cluster does not go
        # through nginx to access prometheus, the central grafana we use does go
        # through nginx, so we increase the timeout here as well.
        nginx.ingress.kubernetes.io/proxy-read-timeout: "120"
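      # Once this ingress and prometheusIngressAuthSecret are enabled for a
      # cluster, the server can be queried from outside with HTTP basic auth.
      # A hedged example (the hostname is hypothetical and cluster specific):
      #
      #   curl -u "$USERNAME:$PASSWORD" \
      #     "https://prometheus.example.2i2c.cloud/api/v1/query?query=up"
      #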
    strategy:
      # type is set to Recreate as we have a persistent disk attached. The
      # default of RollingUpdate would fail by getting stuck waiting for a new
      # pod trying to mount the same storage that only supports being mounted
      # by one pod.
      type: Recreate
    resources:
      # Prometheus cpu/memory requests/limits need to be carefully considered based on:
      #
      # CPU:
      #
      # During startup prometheus-server can compete for ~all of the node's
      # CPU unless it's bounded by a limit. The key to avoiding this isn't to
      # provide a low limit, as what's low is relative to the node's capacity,
      # which could for example be 2, 4, or 8 CPU.
      #
      # To avoid starving other pods, the key is to have a sufficiently low
      # CPU request so that this pod doesn't get far higher priority to use
      # the available CPU on the node when in competition with the other pods
      # for CPU.
      #
      # Memory:
      #
      # During startup prometheus-server will use up a lot of memory, up to X,
      # where X grows as more metrics have been collected. Over time,
      # prometheus-server may run into its memory limit and get OOMKilled, or
      # the node it runs on may run out of memory before that happens and
      # prometheus-server or other pods exceeding their memory requests will get
      # evicted.
      #
      # To fail reliably in prometheus-server without influencing other pods,
      # we should set the memory request to match the limit.
      #
      requests:
        cpu: 0.1
        memory: 4Gi
      limits:
        cpu: 4
        memory: 4Gi
    # prometheus-server may take a long time to start up when loading large
    # amounts of previously scraped data. To avoid having a livenessProbe
    # terminate the starting pod, we opt in to a more tolerant startupProbe that
    # is active instead of the livenessProbe during startup.
    startupProbe:
      enabled: true
      periodSeconds: 10
      failureThreshold: 60
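      # With periodSeconds: 10 and failureThreshold: 60, this tolerates roughly
      # 10 * 60 = 600 seconds (~10 minutes) of startup before the pod is killed.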
    podLabels:
      # To scrape metrics from a hub pod, prometheus-server needs to be
      # explicitly allowed network access to hub pods in clusters with
      # NetworkPolicy enforcement enabled. Adding this label does the trick.
      #
      # ref: https://z2jh.jupyter.org/en/stable/administrator/security.html#introduction-to-the-chart-s-four-network-policies
      #
      hub.jupyter.org/network-access-hub: "true"
    persistentVolume:
      size: 400Gi
    service:
      type: ClusterIP

# grafana presents dashboards based on metrics from a prometheus server.
#
# We set up dashboards programmatically via a grafana REST API based on
# https://github.com/jupyterhub/grafana-dashboards.
#
# values ref: https://github.com/grafana/helm-charts/blob/main/charts/grafana/values.yaml
#
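# As a hedged sketch of that dashboard deployment (check the repo's README for
# the authoritative invocation), the deploy script is run against a grafana URL
# with an API token in the environment, roughly like:
#
#   GRAFANA_TOKEN=... ./deploy.py grafana.example.2i2c.cloud
#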
grafana:
  # We decrypt and pass `grafana.adminPassword` just in time as part of
  # deploying grafana, and find that acceptable - so we disable this strict check.
  assertNoLeakedSecrets: false
  persistence:
    # A PVC is used to enable Grafana to store auth details and dashboard
    # definitions.
    enabled: true
    # size could probably be smaller as not much space should be required, but
    # the chart default of 10Gi has made it a bit tricky to reduce it
    # retroactively.
    size: 10Gi
  deploymentStrategy:
    # type Recreate is required since we attach a PVC that can only be mounted
    # for writing by one pod at a time.
    type: Recreate
  readinessProbe:
    # With one grafana pod replica, having a readiness probe fail is pointless.
    # We ensure it won't fail before the livenessProbe that would restart the
    # container.
    failureThreshold: 1000
    initialDelaySeconds: 1
  rbac:
    # namespaced avoids creating cluster-scoped resources (ClusterRole, service
    # accounts etc.), which we do fine without.
    namespaced: true
  # initChownData refers to an init container, enabled by default, that isn't
  # needed as we don't reconfigure the linux user the grafana server runs as.
  initChownData:
    enabled: false
  # resources for grafana were set after inspecting cpu and memory use via
  # prometheus and grafana.
  #
  # Grafana's memory use seems to increase over time but seems reasonable to
  # stay below 200Mi in general. Memory can peak when dashboards are updated,
  # and the limit was increased to 400Mi as it's been seen getting OOMKilled.
  # Grafana's CPU use seems minuscule, with peaks at up to 9m CPU when one user
  # is browsing its dashboards.
  #
  # PromQL queries for CPU and memory use:
  # - CPU: sum(rate(container_cpu_usage_seconds_total{container="grafana", namespace="support"}[5m])) by (pod)
  # - Memory: sum(container_memory_usage_bytes{container="grafana", namespace="support"}) by (pod)
  #
  resources:
    limits:
      cpu: 100m
      memory: 400Mi
    requests:
      cpu: 10m
      memory: 200Mi
  service:
    type: ClusterIP
  ingress:
    enabled: true
    ingressClassName: nginx
    annotations:
      # nginx.ingress.kubernetes.io/proxy-body-size is increased from the
      # default of 1m to avoid HTTP 413 (Request Entity Too Large) when making
      # POST requests (via jupyterhub/grafana-dashboards' deploy script, from a
      # github workflow) to update dashboards with json files representing them.
      nginx.ingress.kubernetes.io/proxy-body-size: 64m
      # Increase the timeout for each query made by the grafana frontend to the
      # grafana backend to 2min, which is the default timeout for prometheus
      # queries. This also matches the timeout for the dataproxy in grafana,
      # set under `grafana.ini` below. These two timeouts are set together
      # to give prometheus the best chance of executing the queries we care about.
      nginx.ingress.kubernetes.io/proxy-read-timeout: "120"
      cert-manager.io/cluster-issuer: letsencrypt-prod
  # grafana is partially configured for GitHub authentication here, but the
  # following values need to be set as well, where client_secret should be kept
  # secret:
  #
  #   server:
  #     root_url: https://grafana.<cluster_name>.2i2c.cloud/
  #   auth.github:
  #     enabled: true
  #     allowed_organizations: "2i2c-org some-other-gh-org"
  #     client_id: ""
  #     client_secret: ""
  #
  # grafana.ini ref: https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/
  #
  grafana.ini:
    # dataproxy is used to make requests to prometheus via the backend.
    # This allows authless access to the prometheus server in the same namespace.
    dataproxy:
      # Enable logging so we can debug grafana timeouts
      logging: true
      # The default prometheus query timeout is 120s, so let's allow grafana to
      # wait that long as well.
      # See https://prometheus.io/docs/prometheus/latest/command-line/prometheus/
      # for default prometheus query timeouts.
      # Because our grafana is behind an nginx ingress, this should match the
      # read timeout set above under `ingress.annotations`.
      timeout: 120
    unified_alerting:
      # Disable alerting for now as it's not something our community users can
      # set up. See https://github.com/2i2c-org/infrastructure/issues/4919.
      enabled: false
    server:
      root_url: ""
      enable_gzip: true
    auth.github:
      enabled: false
      allowed_organizations: ""
      client_id: ""
      client_secret: ""
      allow_sign_up: true
      scopes: user:email,read:org
      auth_url: https://github.com/login/oauth/authorize
      token_url: https://github.com/login/oauth/access_token
      api_url: https://api.github.com/user
  plugins:
    - yesoreyeram-infinity-datasource
  datasources:
    datasources.yaml:
      apiVersion: 1
      datasources:
        # Automatically add the prometheus server in the same namespace as the
        # grafana as a datasource
        - name: prometheus
          orgId: 1
          type: prometheus
          # This is the name of the kubernetes service exposed by the prometheus server
          url: http://support-prometheus-server
          access: proxy
          isDefault: false
          editable: false
        - name: yesoreyeram-infinity-datasource
          type: yesoreyeram-infinity-datasource
          isDefault: false
          editable: true

# cryptnono kills processes attempting to mine the cryptocurrency Monero in the
# k8s cluster.
#
# values ref: https://github.com/yuvipanda/cryptnono/blob/main/cryptnono/values.yaml
#
cryptnono:
  enabled: true
  # resources for cryptnono were set after inspecting cpu and memory use via
  # prometheus and grafana.
  #
  # cryptnono has an init container (fetch-kernel-headers) and one container per
  # detector. We currently only use one detector (execwhacker).
  #
  # In the past, the init container has been found using up to 1.6Gi of memory
  # and up to about 600m CPU for 4 minutes. However, recent changes seem to have
  # made this much faster, and there's no record of the init container because
  # our prometheus scrape interval is 1 minute and the init container seems to
  # complete by then. We retain the older measured figures until we can make new
  # measurements.
  #
  # Since cryptnono is a non-critical service, we are at the moment allowing it
  # to be evicted during node memory pressure by providing a low memory request
  # compared to the limit. We are also not requesting significant amounts of CPU
  # so that it doesn't compete well with others initially.
  fetchKernelHeaders:
    resources:
      limits:
        cpu: 800m
        memory: 2Gi
      requests:
        cpu: 5m
        memory: 100Mi
  image:
    # Brings in https://github.com/cryptnono/cryptnono/pull/33
    # FIXME: Unset this once that gets merged and we update to the newest cryptnono version
    tag: 0.3.1-0.dev.git.138.ha4b11b8
  detectors:
    execwhacker:
      enabled: true
      # Cryptnono can get information about what container the kill happened in,
      # so we can identify which user was cryptomining
      containerdHostPath: /run/containerd/containerd.sock
      metrics:
        enabled: true
      configs:
        unencrypted-test-01:
          bannedCommandStrings:
            # Provide an unencrypted, randomly generated test string. All
            # processes that have this in their command line will be killed.
            # This is helpful for testing that execwhacker is enabled and
            # functional; see the commented example after this list.
            - beiquatohGa1uay0ahMies9couyahPeiz9xohju3Ahvaik3FaeM7eey1thaish1U
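          # A hedged way to test this from a pod on the cluster: start a
          # long-running process whose command line contains the test string and
          # confirm it gets killed shortly afterwards, e.g.
          #
          #   sh -c 'sleep 600' beiquatohGa1uay0ahMies9couyahPeiz9xohju3Ahvaik3FaeM7eey1thaish1U
          #
          # (the extra argument only serves to put the string on the command line)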
    monero:
      # Disable the monero specific detector, as execwhacker catches most of
      # that too. The monero detector uses bpftrace, which has issues with
      # cross-kernel compatibility.
      enabled: false
      resources:
        # Measured with the following prometheus queries:
        # - Memory: sum(container_memory_usage_bytes{container="monero", namespace="support"}) by (instance)
        # - CPU: sum(rate(container_cpu_usage_seconds_total{container="trace", namespace="support"}[5m])) by (instance)
        # Memory seems to hover mostly around the 60Mi mark, and CPU is generally
        # less than 0.0002. But 1m (or 0.001) is the lowest that can be specified
        # in kubernetes, so we use that.
        limits:
          memory: 128Mi
          cpu: 5m
        requests:
          memory: 64Mi
          cpu: 1m

# aws-ce-grafana-backend exposes AWS Cost Explorer API info to Grafana
#
# values ref: https://github.com/2i2c-org/infrastructure/blob/main/helm-charts/aws-ce-grafana-backend/values.yaml
#
aws-ce-grafana-backend:
  enabled: false

# Configuration of templates provided directly by this chart
# -------------------------------------------------------------------------------
#
prometheusIngressAuthSecret:
  enabled: false
  username: ""
  password: ""

redirects:
  rules: []

# Enable a daemonset to install the nvidia device plugin on GPU nodes.
# Not necessary on GCP or AWS, as it is handled automatically by terraform or
# eksctl respectively.
nvidiaDevicePlugin:
  azure:
    enabled: false

# Setup a separate storageClass specifically for prometheus data
prometheusStorageClass:
  gke:
    # Defaults to false, until all GKE clusters have been manually
    # migrated. Could default to true after that.
    enabled: false
    # pd-balanced is SSD backed, much faster than spinning standard disks and
    # cheaper than pd-ssd. We add the -retain suffix to indicate a reclaimPolicy
    # of Retain, rather than the default of Delete.
    name: balanced-rwo-retain
    parameters:
      type: pd-balanced
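    # As a hedged sketch (the actual manifest is rendered by this chart's
    # templates), the resulting StorageClass would look roughly like:
    #
    #   apiVersion: storage.k8s.io/v1
    #   kind: StorageClass
    #   metadata:
    #     name: balanced-rwo-retain
    #   provisioner: pd.csi.storage.gke.io
    #   reclaimPolicy: Retain
    #   volumeBindingMode: WaitForFirstConsumer
    #   parameters:
    #     type: pd-balanced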

# Setup a deployment that will periodically backup the Filestore contents
gcpFilestoreBackups:
  enabled: false

# A placeholder, as global values that can be referenced from the same location
# of any chart should be possible to provide, but aren't necessarily provided or
# used.
global: {}