diff --git a/.configs/certificates.yaml b/.configs/certificates.yaml new file mode 100644 index 000000000..ac0643300 --- /dev/null +++ b/.configs/certificates.yaml @@ -0,0 +1,35 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: prometheus +--- +apiVersion: secretgen.k14s.io/v1alpha1 +kind: Certificate +metadata: + name: ca-cert + namespace: prometheus +spec: + isCA: true +--- +apiVersion: secretgen.k14s.io/v1alpha1 +kind: Certificate +metadata: + name: prometheus-ssl + namespace: prometheus +spec: + alternativeNames: + - prometheus-server.prometheus.svc.cluster.local + caRef: + name: ca-cert +--- +apiVersion: secretgen.k14s.io/v1alpha1 +kind: Certificate +metadata: + name: prometheus-workload-ssl + namespace: prometheus +spec: + alternativeNames: + - prometheus-workload-server.prometheus.svc.cluster.local + caRef: + name: ca-cert diff --git a/.github/configs/lintconf.yaml b/.configs/lintconf.yaml similarity index 100% rename from .github/configs/lintconf.yaml rename to .configs/lintconf.yaml diff --git a/.configs/loki.yaml b/.configs/loki.yaml new file mode 100644 index 000000000..ebe435107 --- /dev/null +++ b/.configs/loki.yaml @@ -0,0 +1,93 @@ +--- +loki: + commonConfig: + replication_factor: 1 + schemaConfig: + configs: + - from: 2024-04-01 + store: tsdb + object_store: s3 + schema: v13 + index: + prefix: loki_index_ + period: 24h + ingester: + chunk_encoding: snappy + querier: + # Default is 4, if you have enough memory and CPU you can increase, reduce if OOMing + max_concurrent: 2 + +test: + enabled: false + +lokiCanary: + enabled: false + +gateway: + basicAuth: + enabled: true + username: loki + password: lokipassword + service: + port: 8080 + +deploymentMode: SingleBinary +singleBinary: + replicas: 1 +# resources: +# limits: +# cpu: 3 +# memory: 4Gi +# requests: +# cpu: 2 +# memory: 2Gi +# extraEnv: +# # Keep a little bit lower than memory limits +# - name: GOMEMLIMIT +# value: 3750MiB + +# Enable minio for storage +minio: + enabled: true + +# Zero 
out replica counts of other deployment modes +backend: + replicas: 0 +read: + replicas: 0 +write: + replicas: 0 + +ingester: + replicas: 0 +querier: + replicas: 0 +queryFrontend: + replicas: 0 +queryScheduler: + replicas: 0 +distributor: + replicas: 0 +compactor: + replicas: 0 +indexGateway: + replicas: 0 +bloomCompactor: + replicas: 0 +bloomGateway: + replicas: 0 +resultsCache: + enabled: false +chunksCache: + enabled: false + +monitoring: + selfMonitoring: + enabled: false + grafanaAgent: + installOperator: false + serviceMonitor: + enabled: true + # This actually isn't recommended by Loki, the default is 15s for a reason, but we don't want to upset + # our DPM test calculations. + interval: 1m diff --git a/.configs/prometheus.yaml b/.configs/prometheus.yaml new file mode 100644 index 000000000..6b6d45cd4 --- /dev/null +++ b/.configs/prometheus.yaml @@ -0,0 +1,49 @@ +--- +server: + extraFlags: + - enable-feature=otlp-write-receiver + - enable-feature=remote-write-receiver + - web.config.file=/etc/config/web.yml + + extraSecretMounts: + - name: prometheus-ssl + mountPath: /etc/prometheus-ssl + secretName: prometheus-ssl + readOnly: true + + persistentVolume: + enabled: false + + probeHeaders: + - name: "Authorization" + value: "Basic cHJvbXVzZXI6cHJvbWV0aGV1c3Bhc3N3b3Jk" + probeScheme: HTTPS + + service: + servicePort: 9090 + +serverFiles: + prometheus.yml: + scrape_configs: [] + web.yml: + basic_auth_users: + promuser: $2a$12$1UJsAG4QnhjjDzqcSVkZmeDxxjgIFOAmzfuVTybTuhhDnYgfuAbAq # "prometheuspassword" + tls_server_config: + cert_file: /etc/prometheus-ssl/crt.pem + key_file: /etc/prometheus-ssl/key.pem + +configmapReload: + prometheus: + enabled: false + +alertmanager: + enabled: false + +kube-state-metrics: + enabled: false + +prometheus-node-exporter: + enabled: false + +prometheus-pushgateway: + enabled: false diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 000000000..3370c24c3 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,16 
@@ +# This file is used to define the owners of the code in this repository. +# https://help.github.com/articles/about-codeowners/ + +# Global owners +* @petewall + +# Chart owners +charts/feature-annotation-autodiscovery @grafana/k8s-monitoring-dev +charts/feature-application-observability @rlankfo +charts/feature-cluster-events @grafana/k8s-monitoring-dev +charts/feature-cluster-metrics @grafana/k8s-monitoring-dev +charts/feature-frontend-observability @rlankfo +charts/feature-pod-logs @grafana/k8s-monitoring-dev +charts/feature-profiling @simonswine +charts/feature-prometheus-operator-objects @grafana/k8s-monitoring-dev +charts/k8s-monitoring-v1 @grafana/k8s-monitoring-dev diff --git a/.github/workflows/helm-test.yml b/.github/workflows/helm-test.yml index 41f5069ec..446818550 100644 --- a/.github/workflows/helm-test.yml +++ b/.github/workflows/helm-test.yml @@ -15,7 +15,7 @@ on: env: CT_CONFIGFILE: "${{ github.workspace }}/.github/configs/ct.yaml" - LINT_CONFIGFILE: "${{ github.workspace }}/.github/configs/lintconf.yaml" + LINT_CONFIGFILE: "${{ github.workspace }}/.configs/lintconf.yaml" GRAFANA_ALLOY_VALUES: "${{ github.workspace }}/.github/configs/alloy-config.yaml" GRAFANA_ALLOY_LOKI_OTLP_VALUES: "${{ github.workspace }}/.github/configs/alloy-config-loki-otlp.yaml" GRAFANA_ALLOY_RECEIVER_SERVICE: "${{ github.workspace }}/.github/configs/receiver-service.yaml" @@ -212,4 +212,4 @@ jobs: if: (steps.list-changed.outputs.changed == 'true') || (contains(github.event.pull_request.labels.*.name, 'full_test_required')) run: | latestRelease=$(git describe --abbrev=0 --tags) - ct install --all --config "${CT_CONFIGFILE}" --since "${latestRelease}" --helm-extra-args "--timeout 10m" + ct install --config "${CT_CONFIGFILE}" --since "${latestRelease}" --helm-extra-args "--timeout 10m" --charts charts/k8s-monitoring-v1 diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml new file mode 100644 index 000000000..699fa2886 --- /dev/null 
+++ b/.github/workflows/integration-test.yml @@ -0,0 +1,56 @@ +--- +name: Integration Test +# yamllint disable-line rule:truthy +on: + push: + branches: ["main"] + paths: + - 'charts/**' + - '!charts/k8s-monitoring-v1/**' + pull_request: + paths: + - 'charts/**' + - '!charts/k8s-monitoring-v1/**' + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +jobs: + list-tests: + name: List tests + runs-on: ubuntu-latest + outputs: + tests: ${{ steps.list_tests.outputs.tests }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: List tests + id: list_tests + run: | + tests=$(ls charts/k8s-monitoring/tests/integration) + echo "Tests: ${tests}" + echo "tests=$(echo "${tests}" | jq --raw-input --slurp --compact-output 'split("\n") | map(select(. != ""))')" >> "${GITHUB_OUTPUT}" + + run-tests: + name: Integration Test + needs: list-tests + runs-on: ubuntu-latest + strategy: + matrix: + test: ${{ fromJson(needs.list-tests.outputs.tests) }} + fail-fast: false + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Helm + uses: azure/setup-helm@v4 + + - name: Create kind cluster + uses: helm/kind-action@v1 + + - name: Run test + run: | + echo "Testing ${{ matrix.test }}" + CREATE_CLUSTER=false ./scripts/integration-test.sh "charts/k8s-monitoring/tests/integration/${{ matrix.test }}" diff --git a/.github/workflows/reviewdog.yml b/.github/workflows/reviewdog.yml index 1aa1b3cf3..bdb9d677b 100644 --- a/.github/workflows/reviewdog.yml +++ b/.github/workflows/reviewdog.yml @@ -2,9 +2,13 @@ name: ReviewDog # yamllint disable-line rule:truthy on: - pull_request: + push: branches: ["main"] + pull_request: + + workflow_dispatch: + jobs: markdownlint: name: runner / markdownlint @@ -117,16 +121,6 @@ jobs: github_token: ${{ secrets.github_token }} reporter: github-check - eclint: - name: runner / eclint - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: reviewdog/action-eclint@v1 - with: 
- github_token: ${{ secrets.github_token }} - reporter: github-check - textlint: name: runner / textlint runs-on: ubuntu-latest steps: @@ -137,7 +131,7 @@ jobs: - env: REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - npx textlint --format checkstyle --config ./.textlintrc --ignore-path ./.textlintignore $(find . -type f -name "*.md" -not \( -path "./node_modules/*" -o -path "./data-alloy/*" \)) | \ + find . -type f -name "*.md" -not \( -path "./node_modules/*" -o -path "./data-alloy/*" \) -print0 | xargs -0 npx textlint --format checkstyle --config ./.textlintrc --ignore-path ./.textlintignore | \ reviewdog -f=checkstyle -name="textlint" -reporter=github-check -level=info alloy: diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml new file mode 100644 index 000000000..7178e9cb5 --- /dev/null +++ b/.github/workflows/unit-test.yml @@ -0,0 +1,135 @@ +--- +name: Unit Test +# yamllint disable-line rule:truthy +on: + push: + branches: ["main"] + paths: + - 'charts/**' + - '!charts/k8s-monitoring-v1/**' + pull_request: + paths: + - 'charts/**' + - '!charts/k8s-monitoring-v1/**' + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + + +jobs: + detect-changed-charts: + name: Detect Changed Charts + runs-on: ubuntu-latest + outputs: + changed_charts: ${{ steps.changed_charts.outputs.changed_charts }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Detect Changed Charts + id: changed_charts + run: | + if [ "${{ github.event_name }}" == "pull_request" ]; then + # In pull request, compare against the base branch (upstream) + base_branch="${{ github.event.pull_request.base.ref }}" + echo "Comparing against base branch: $base_branch" + git fetch origin $base_branch + base_commit="origin/$base_branch" + elif [ "${{ github.event_name }}" == "push" ]; then + # In push to main, compare the last commit with HEAD^ + base_commit="HEAD^" + elif [ "${{ github.event_name }}" == "workflow_dispatch" ]; then + # 
In manual trigger, run for all charts + echo "Manual dispatch detected, running tests for all charts" + # shellcheck disable=SC2010 + echo "changed_charts=$(ls charts | grep -v "k8s-monitoring-v1" | sort -u | jq --raw-input --slurp --compact-output 'split("\n") | map(select(. != ""))')" >> "${GITHUB_OUTPUT}" + exit 0 + fi + + # Check if base commit exists, fallback to empty tree if none + if ! git rev-parse --verify "$base_commit" >/dev/null 2>&1; then + base_commit=$(git hash-object -t tree /dev/null) + fi + + # Detect modified files + modified_charts=$(git diff --name-only "$base_commit" HEAD -- 'charts/*' | grep "^charts/" | cut -d "/" -f2 | sort -u) + + # Detect newly added files (untracked files) + added_charts=$(git ls-files --others --exclude-standard -- 'charts/*' | grep "^charts/" | cut -d "/" -f2 | sort -u) + + # Combine both added and modified charts + changed_charts=$(echo -e "$modified_charts\n$added_charts" | grep -v "k8s-monitoring-v1" | sort -u) + + if [ -z "$changed_charts" ]; then + echo "No changes detected in charts" + changed_charts="none" + fi + echo "Changed charts: $changed_charts" + echo "changed_charts=$(echo "$changed_charts" | jq --raw-input --slurp --compact-output 'split("\n") | map(select(. != ""))')" >> "${GITHUB_OUTPUT}" + + run-tests: + name: Testing + needs: detect-changed-charts + runs-on: ubuntu-latest + strategy: + matrix: + dir: ${{ fromJson(needs.detect-changed-charts.outputs.changed_charts) }} + fail-fast: false + if: ${{ needs.detect-changed-charts.outputs.changed_charts != 'none' }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up chart-testing + uses: helm/chart-testing-action@v2 + + # Installing Grafana Alloy because we need it to lint the generated alloy config files. 
+ # https://grafana.com/docs/alloy/latest/get-started/install/linux/ + - name: Install Grafana Alloy + run: | + sudo mkdir -p /etc/apt/keyrings/ + wget -q -O - https://apt.grafana.com/gpg.key | gpg --dearmor | sudo tee /etc/apt/keyrings/grafana.gpg > /dev/null + echo "deb [signed-by=/etc/apt/keyrings/grafana.gpg] https://apt.grafana.com stable main" | sudo tee /etc/apt/sources.list.d/grafana.list + sudo apt-get update + sudo apt-get install -y alloy + + - name: Run tests + run: | + echo "Testing ${{ matrix.dir }}" + cd charts/${{ matrix.dir }} + make test + + check-generated-files: + name: Check Generated Files + needs: detect-changed-charts + runs-on: ubuntu-latest + strategy: + matrix: + dir: ${{ fromJson(needs.detect-changed-charts.outputs.changed_charts) }} + fail-fast: false + if: ${{ needs.detect-changed-charts.outputs.changed_charts != 'none' }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Regenerate files + run: | + echo "Running make all in charts/${{ matrix.dir }}" + cd charts/${{ matrix.dir }} + make clean build + + - name: Check for changes in generated files + run: | + cd charts/${{ matrix.dir }} + if [ "${{ matrix.dir }}" == "k8s-monitoring" ]; then + # Skip checking subchart files for k8s-monitoring, which are always modified, even if the contents are identical + if ! git diff --exit-code -- ':!charts/*.tgz'; then + echo "Generated files in charts/${{ matrix.dir }} are not up to date. Please run 'make all' and commit the changes." + exit 1 + fi + elif ! git diff --exit-code .; then + echo "Generated files in charts/${{ matrix.dir }} are not up to date. Please run 'make all' and commit the changes." + exit 1 + else + echo "Generated files in charts/${{ matrix.dir }} are up to date." 
+ fi diff --git a/.yamllint.yml b/.yamllint.yml index 83c2f2dfc..aed881e83 100644 --- a/.yamllint.yml +++ b/.yamllint.yml @@ -8,6 +8,7 @@ ignore: - .git - data-alloy - node_modules + - charts/k8s-monitoring/docs/examples/**/output.yaml - charts/k8s-monitoring-v1/docs/examples/**/output.yaml - charts/**/templates diff --git a/CODEOWNERS b/CODEOWNERS deleted file mode 100644 index f543606ae..000000000 --- a/CODEOWNERS +++ /dev/null @@ -1,7 +0,0 @@ -# https://help.github.com/articles/about-codeowners/ - -# https://git-scm.com/docs/gitignore#_pattern_format - -- @jewbetcha -- @petewall -- @skl diff --git a/Makefile b/Makefile index d72678a7f..f76de658b 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,28 @@ -.PHONY: setup install lint lint-chart lint-config lint-configs lint-alloy lint-sh lint-md lint-txt lint-yml lint-ec lint-alex lint-misspell lint-actionlint test install-deps clean SHELL := /bin/bash UNAME := $(shell uname) -CT_CONFIGFILE ?= .github/configs/ct.yaml -LINT_CONFIGFILE ?= .github/configs/lintconf.yaml +FEATURE_CHARTS = $(shell ls charts | grep -v k8s-monitoring) + +.PHONY: build +build: + set -e && \ + for chart in $(FEATURE_CHARTS); do \ + make -C charts/$$chart build; \ + done + make -C charts/k8s-monitoring build + +.PHONY: test +test: build + set -e && \ + for chart in $(FEATURE_CHARTS); do \ + make -C charts/$$chart test; \ + done + make -C charts/k8s-monitoring test #################################################################### # Installation / Setup # #################################################################### +.PHONY: setup install-deps setup install-deps: ifeq ($(UNAME), Darwin) @./scripts/setup.sh @@ -16,51 +31,44 @@ else exit 1 endif +.PHONY: install install: yarn install +.PHONY: clean clean: rm -rf node_modules #################################################################### # Linting # #################################################################### -lint: lint-chart lint-config lint-sh lint-md lint-txt lint-yml 
lint-ec lint-alex lint-misspell lint-actionlint - -lint-chart: - ct lint --debug --config "$(CT_CONFIGFILE)" --lint-conf "$(LINT_CONFIGFILE)" --check-version-increment=false +.PHONY: lint lint-sh lint-md lint-txt lint-yml lint-alex lint-misspell lint-actionlint +lint: lint-sh lint-md lint-txt lint-yml lint-alex lint-misspell lint-actionlint -lint-config lint-configs lint-alloy: - @./scripts/lint-alloy.sh $(METRICS_CONFIG_FILES) $(EVENTS_CONFIG_FILES) $(LOGS_CONFIG_FILES) --public-preview $(PROFILES_CONFIG_FILES) - -# Shell Linting +# Shell Linting for checking shell scripts lint-sh lint-shell: @./scripts/lint-shell.sh || true -# Markdown Linting +# Markdown Linting for checking markdown files lint-md lint-markdown: @./scripts/lint-markdown.sh || true -# Text Linting +# Text Linting for checking text files lint-txt lint-text: @./scripts/lint-text.sh || true -# Yaml Linting +# Yaml Linting for checking yaml files lint-yml lint-yaml: @./scripts/lint-yaml.sh || true -# Editorconfig Linting -lint-ec lint-editorconfig: - @./scripts/lint-editorconfig.sh || true - -# Alex Linting +# Alex Linting for checking insensitive language lint-alex: @./scripts/lint-alex.sh || true -# Misspell Linting +# Misspell Linting for checking common spelling mistakes lint-misspell: @./scripts/lint-misspell.sh || true -# Actionlint Linting +# Actionlint Linting for checking GitHub Actions lint-al lint-actionlint: @./scripts/lint-actionlint.sh || true diff --git a/charts/feature-annotation-autodiscovery/.helmignore b/charts/feature-annotation-autodiscovery/.helmignore new file mode 100644 index 000000000..2b29eaf56 --- /dev/null +++ b/charts/feature-annotation-autodiscovery/.helmignore @@ -0,0 +1,6 @@ +docs +schema-mods +tests +Makefile +README.md +README.md.gotmpl diff --git a/charts/feature-annotation-autodiscovery/Chart.lock b/charts/feature-annotation-autodiscovery/Chart.lock new file mode 100644 index 000000000..b89222ee5 --- /dev/null +++ 
b/charts/feature-annotation-autodiscovery/Chart.lock @@ -0,0 +1,3 @@ +dependencies: [] +digest: sha256:643d5437104296e21d906ecb15b2c96ad278f20cfc4af53b12bb6069bd853726 +generated: "2024-09-25T13:45:54.706765-05:00" diff --git a/charts/feature-annotation-autodiscovery/Chart.yaml b/charts/feature-annotation-autodiscovery/Chart.yaml new file mode 100644 index 000000000..2de14ccb7 --- /dev/null +++ b/charts/feature-annotation-autodiscovery/Chart.yaml @@ -0,0 +1,11 @@ +--- +apiVersion: v2 +name: k8s-monitoring-feature-annotation-autodiscovery +description: Gathers metrics automatically based on Kubernetes Pod and Service annotations +icon: https://raw.githubusercontent.com/grafana/grafana/main/public/img/grafana_icon.svg +version: 1.0.0 +appVersion: 1.0.0 +maintainers: + - email: pete.wall@grafana.com + name: petewall +dependencies: [] diff --git a/charts/feature-annotation-autodiscovery/Makefile b/charts/feature-annotation-autodiscovery/Makefile new file mode 100644 index 000000000..605b55098 --- /dev/null +++ b/charts/feature-annotation-autodiscovery/Makefile @@ -0,0 +1,34 @@ +HAS_HELM_DOCS := $(shell command -v helm-docs;) +HAS_HELM_UNITTEST := $(shell helm plugin list | grep unittest 2> /dev/null) + +.SECONDEXPANSION: +README.md: values.yaml Chart.yaml $$(wildcard README.md.gotmpl) +ifdef HAS_HELM_DOCS + helm-docs +else + docker run --rm --volume "$(shell pwd):/helm-docs" -u $(shell id -u) jnorwood/helm-docs:latest +endif + +Chart.lock: Chart.yaml + helm dependency update . + @touch Chart.lock # Ensure the timestamp is updated + +values.schema.json: values.yaml $$(wildcard schema-mods/*) + ../../scripts/schema-gen.sh . + +.PHONY: clean +clean: + rm -f README.md values.schema.json + +.PHONY: build +build: README.md Chart.lock values.schema.json + +.PHONY: test +test: build + helm lint . + ct lint --lint-conf ../../.configs/lintconf.yaml --check-version-increment=false --charts . +ifdef HAS_HELM_UNITTEST + helm unittest . 
+else + docker run --rm --volume $(shell pwd):/apps helmunittest/helm-unittest . +endif diff --git a/charts/feature-annotation-autodiscovery/README.md b/charts/feature-annotation-autodiscovery/README.md new file mode 100644 index 000000000..92018742e --- /dev/null +++ b/charts/feature-annotation-autodiscovery/README.md @@ -0,0 +1,65 @@ +# k8s-monitoring-feature-annotation-autodiscovery + +![Version: 1.0.0](https://img.shields.io/badge/Version-1.0.0-informational?style=flat-square) ![AppVersion: 1.0.0](https://img.shields.io/badge/AppVersion-1.0.0-informational?style=flat-square) + +Gathers metrics automatically based on Kubernetes Pod and Service annotations + +## Maintainers + +| Name | Email | Url | +| ---- | ------ | --- | +| petewall | | | + +## Values + +### Annotations + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| annotations.instance | string | `"k8s.grafana.com/instance"` | Annotation for overriding the instance label | +| annotations.job | string | `"k8s.grafana.com/job"` | Annotation for overriding the job label | +| annotations.metricsPath | string | `"k8s.grafana.com/metrics.path"` | Annotation for setting or overriding the metrics path. If not set, it defaults to /metrics | +| annotations.metricsPortName | string | `"k8s.grafana.com/metrics.portName"` | Annotation for setting the metrics port by name. | +| annotations.metricsPortNumber | string | `"k8s.grafana.com/metrics.portNumber"` | Annotation for setting the metrics port by number. | +| annotations.metricsScheme | string | `"k8s.grafana.com/metrics.scheme"` | Annotation for setting the metrics scheme, default: http. | +| annotations.metricsScrapeInterval | string | `"k8s.grafana.com/metrics.scrapeInterval"` | Annotation for overriding the scrape interval for this service or pod. Value should be a duration like "15s, 1m". 
Overrides metrics.autoDiscover.scrapeInterval | +| annotations.scrape | string | `"k8s.grafana.com/scrape"` | Annotation for enabling scraping for this service or pod. Value should be either "true" or "false" | + +### Scrape Settings + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| bearerToken | object | `{"enabled":true,"token":"/var/run/secrets/kubernetes.io/serviceaccount/token"}` | Sets bearer_token_file line in the prometheus.scrape annotation_autodiscovery. | +| scrapeInterval | string | 60s | How frequently to scrape metrics from PodMonitor objects. Only used if the PodMonitor does not specify the scrape interval. Overrides global.scrapeInterval | + +### Discovery Settings + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| extraDiscoveryRules | string | `""` | Rule blocks to be added to the prometheus.operator.podmonitors component for PodMonitors. These relabeling rules are applied pre-scrape against the targets from service discovery. The relabelings defined in the PodMonitor object are applied first, then these relabelings are applied. Before the scrape, any remaining target labels that start with `__` (i.e. `__meta_kubernetes*`) are dropped. ([docs](https://grafana.com/docs/alloy/latest/reference/components/discovery.relabel/#rule-block)) | + +### Metric Processing Settings + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| extraMetricProcessingRules | string | `""` | Rule blocks to be added to the prometheus.relabel component for PodMonitor objects. These relabeling rules are applied post-scrape against the metrics returned from the scraped target, no `__meta*` labels are present. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#rule-block)) | +| maxCacheSize | string | `nil` | Sets the max_cache_size for cadvisor prometheus.relabel component. 
This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#arguments)) Overrides global.maxCacheSize | +| metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. | +| metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. | + +### General settings + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| fullnameOverride | string | `""` | Full name override | +| nameOverride | string | `""` | Name override | + +### Global Settings + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| global.maxCacheSize | int | `100000` | Sets the max_cache_size for every prometheus.relabel component. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#arguments)) This should be at least 2x-5x your largest scrape target or samples appended rate. | +| global.scrapeInterval | string | `"60s"` | How frequently to scrape metrics. | + +---------------------------------------------- +Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2) diff --git a/charts/feature-annotation-autodiscovery/templates/_helpers.tpl b/charts/feature-annotation-autodiscovery/templates/_helpers.tpl new file mode 100644 index 000000000..b42d94195 --- /dev/null +++ b/charts/feature-annotation-autodiscovery/templates/_helpers.tpl @@ -0,0 +1,29 @@ +{{/* +Create a default fully qualified name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. 
+*/}} +{{- define "feature.annotationAutodiscovery.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" | lower }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride | lower }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" | lower }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" | lower }} +{{- end }} +{{- end }} +{{- end }} + +{{- define "escape_annotation" -}} +{{ . | replace "-" "_" | replace "." "_" | replace "/" "_" }} +{{- end }} + +{{- define "pod_annotation" -}} +{{ printf "__meta_kubernetes_pod_annotation_%s" (include "escape_annotation" .) }} +{{- end }} + +{{- define "service_annotation" -}} +{{ printf "__meta_kubernetes_service_annotation_%s" (include "escape_annotation" .) }} +{{- end }} diff --git a/charts/feature-annotation-autodiscovery/templates/_module.alloy.tpl b/charts/feature-annotation-autodiscovery/templates/_module.alloy.tpl new file mode 100644 index 000000000..ada62d60b --- /dev/null +++ b/charts/feature-annotation-autodiscovery/templates/_module.alloy.tpl @@ -0,0 +1,232 @@ +{{- define "feature.annotationAutodiscovery.module" }} +declare "annotation_autodiscovery" { + argument "metrics_destinations" { + comment = "Must be a list of metric destinations where collected metrics should be forwarded to" + } + + discovery.kubernetes "pods" { + role = "pod" + } + + discovery.relabel "annotation_autodiscovery_pods" { + targets = discovery.kubernetes.pods.targets + rule { + source_labels = ["{{ include "pod_annotation" .Values.annotations.scrape }}"] + regex = "true" + action = "keep" + } + rule { + source_labels = ["{{ include "pod_annotation" .Values.annotations.job }}"] + action = "replace" + target_label = "job" + } + rule { + source_labels = ["{{ include "pod_annotation" .Values.annotations.instance }}"] + action = "replace" + target_label = "instance" + } + rule { + source_labels = ["{{ include 
"pod_annotation" .Values.annotations.metricsPath }}"] + action = "replace" + target_label = "__metrics_path__" + } + + // Choose the pod port + // The discovery generates a target for each declared container port of the pod. + // If the metricsPortName annotation has value, keep only the target where the port name matches the one of the annotation. + rule { + source_labels = ["__meta_kubernetes_pod_container_port_name"] + target_label = "__tmp_port" + } + rule { + source_labels = ["{{ include "pod_annotation" .Values.annotations.metricsPortName }}"] + regex = "(.+)" + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_pod_container_port_name"] + action = "keepequal" + target_label = "__tmp_port" + } + + // If the metrics port number annotation has a value, override the target address to use it, regardless whether it is + // one of the declared ports on that Pod. + rule { + source_labels = ["{{ include "pod_annotation" .Values.annotations.metricsPortNumber }}", "__meta_kubernetes_pod_ip"] + regex = "(\\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})" + replacement = "[$2]:$1" // IPv6 + target_label = "__address__" + } + rule { + source_labels = ["{{ include "pod_annotation" .Values.annotations.metricsPortNumber }}", "__meta_kubernetes_pod_ip"] + regex = "(\\d+);((([0-9]+?)(\\.|$)){4})" // IPv4, takes priority over IPv6 when both exists + replacement = "$2:$1" + target_label = "__address__" + } + + rule { + source_labels = ["{{ include "pod_annotation" .Values.annotations.metricsScheme }}"] + action = "replace" + target_label = "__scheme__" + } + + rule { + source_labels = ["{{ include "pod_annotation" .Values.annotations.metricsScrapeInterval }}"] + action = "replace" + target_label = "__scrape_interval__" + } +{{- if .Values.extraDiscoveryRules }} +{{ .Values.extraDiscoveryRules | indent 4 }} +{{- end }} + } + + discovery.kubernetes "services" { + role = "service" + } + + discovery.relabel "annotation_autodiscovery_services" { + targets 
= discovery.kubernetes.services.targets + rule { + source_labels = ["{{ include "service_annotation" .Values.annotations.scrape }}"] + regex = "true" + action = "keep" + } + rule { + source_labels = ["{{ include "service_annotation" .Values.annotations.job }}"] + action = "replace" + target_label = "job" + } + rule { + source_labels = ["{{ include "service_annotation" .Values.annotations.instance }}"] + action = "replace" + target_label = "instance" + } + rule { + source_labels = ["{{ include "service_annotation" .Values.annotations.metricsPath }}"] + action = "replace" + target_label = "__metrics_path__" + } + + // Choose the service port + rule { + source_labels = ["__meta_kubernetes_service_port_name"] + target_label = "__tmp_port" + } + rule { + source_labels = ["{{ include "service_annotation" .Values.annotations.metricsPortName }}"] + regex = "(.+)" + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_service_port_name"] + action = "keepequal" + target_label = "__tmp_port" + } + + rule { + source_labels = ["__meta_kubernetes_service_port_number"] + target_label = "__tmp_port" + } + rule { + source_labels = ["{{ include "service_annotation" .Values.annotations.metricsPortNumber }}"] + regex = "(.+)" + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_service_port_number"] + action = "keepequal" + target_label = "__tmp_port" + } + + rule { + source_labels = ["{{ include "service_annotation" .Values.annotations.metricsScheme }}"] + action = "replace" + target_label = "__scheme__" + } + + rule { + source_labels = ["{{ include "service_annotation" .Values.annotations.metricsScrapeInterval }}"] + action = "replace" + target_label = "__scrape_interval__" + } +{{- if .Values.extraDiscoveryRules }} +{{ .Values.extraDiscoveryRules | indent 4 }} +{{- end }} + } + + discovery.relabel "annotation_autodiscovery_http" { + targets = concat(discovery.relabel.annotation_autodiscovery_pods.output, 
discovery.relabel.annotation_autodiscovery_services.output) + rule { + source_labels = ["__scheme__"] + regex = "https" + action = "drop" + } + } + + discovery.relabel "annotation_autodiscovery_https" { + targets = concat(discovery.relabel.annotation_autodiscovery_pods.output, discovery.relabel.annotation_autodiscovery_services.output) + rule { + source_labels = ["__scheme__"] + regex = "https" + action = "keep" + } + } + + prometheus.scrape "annotation_autodiscovery_http" { + targets = discovery.relabel.annotation_autodiscovery_http.output + scrape_interval = {{ .Values.scrapeInterval | default .Values.global.scrapeInterval | quote }} + honor_labels = true +{{- if .Values.bearerToken.enabled }} + bearer_token_file = {{ .Values.bearerToken.token | quote }} +{{- end }} + clustering { + enabled = true + } +{{ if or .Values.metricsTuning.includeMetrics .Values.metricsTuning.excludeMetrics .Values.extraMetricProcessingRules }} + forward_to = [prometheus.relabel.annotation_autodiscovery.receiver] +{{- else }} + forward_to = argument.metrics_destinations.value +{{- end }} + } + + prometheus.scrape "annotation_autodiscovery_https" { + targets = discovery.relabel.annotation_autodiscovery_https.output + scrape_interval = {{ .Values.scrapeInterval | default .Values.global.scrapeInterval | quote }} + honor_labels = true +{{- if .Values.bearerToken.enabled }} + bearer_token_file = {{ .Values.bearerToken.token | quote }} +{{- end }} + tls_config { + insecure_skip_verify = true + } + clustering { + enabled = true + } +{{ if or .Values.metricsTuning.includeMetrics .Values.metricsTuning.excludeMetrics .Values.extraMetricProcessingRules }} + forward_to = [prometheus.relabel.annotation_autodiscovery.receiver] + } + + prometheus.relabel "annotation_autodiscovery" { + max_cache_size = {{ .Values.maxCacheSize | default .Values.global.maxCacheSize | int }} +{{- if .Values.metricsTuning.includeMetrics }} + rule { + source_labels = ["__name__"] + regex = "up|{{ join "|" 
.Values.metricsTuning.includeMetrics }}" + action = "keep" + } +{{- end }} +{{- if .Values.metricsTuning.excludeMetrics }} + rule { + source_labels = ["__name__"] + regex = {{ join "|" .Values.metricsTuning.excludeMetrics | quote }} + action = "drop" + } +{{- end }} +{{- if .Values.extraMetricProcessingRules }} +{{ .Values.extraMetricProcessingRules | indent 4 }} +{{- end }} +{{- end }} + forward_to = argument.metrics_destinations.value + } +} +{{- end -}} diff --git a/charts/feature-annotation-autodiscovery/templates/_notes.tpl b/charts/feature-annotation-autodiscovery/templates/_notes.tpl new file mode 100644 index 000000000..6aa7d11ec --- /dev/null +++ b/charts/feature-annotation-autodiscovery/templates/_notes.tpl @@ -0,0 +1,7 @@ +{{- define "feature.annotationAutodiscovery.notes.deployments" }}{{- end }} + +{{- define "feature.annotationAutodiscovery.notes.task" }} +Scrape metrics from pods and services with the "{{.Values.annotations.scrape}}: true" annotation +{{- end }} + +{{- define "feature.annotationAutodiscovery.notes.actions" }}{{- end }} diff --git a/charts/feature-annotation-autodiscovery/templates/configmap.yaml b/charts/feature-annotation-autodiscovery/templates/configmap.yaml new file mode 100644 index 000000000..cf16bfba7 --- /dev/null +++ b/charts/feature-annotation-autodiscovery/templates/configmap.yaml @@ -0,0 +1,11 @@ +{{- if .Values.deployAsConfigMap }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "feature.annotationAutodiscovery.fullname" . }} + namespace: {{ .Release.Namespace }} +data: + module.alloy: |- + {{- include "feature.annotationAutodiscovery.module" . 
| indent 4 }} +{{- end }} diff --git a/charts/feature-annotation-autodiscovery/tests/__snapshot__/.gitkeep b/charts/feature-annotation-autodiscovery/tests/__snapshot__/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/charts/feature-annotation-autodiscovery/tests/default_test.yaml b/charts/feature-annotation-autodiscovery/tests/default_test.yaml new file mode 100644 index 000000000..016651835 --- /dev/null +++ b/charts/feature-annotation-autodiscovery/tests/default_test.yaml @@ -0,0 +1,209 @@ +# yamllint disable rule:document-start rule:line-length rule:trailing-spaces +suite: Test default values +templates: + - configmap.yaml +tests: + - it: creates a module with default discovery, scraping, and processing configurations + set: + deployAsConfigMap: true + asserts: + - isKind: + of: ConfigMap + - equal: + path: data["module.alloy"] + value: |- + declare "annotation_autodiscovery" { + argument "metrics_destinations" { + comment = "Must be a list of metric destinations where collected metrics should be forwarded to" + } + + discovery.kubernetes "pods" { + role = "pod" + } + + discovery.relabel "annotation_autodiscovery_pods" { + targets = discovery.kubernetes.pods.targets + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_scrape"] + regex = "true" + action = "keep" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_job"] + action = "replace" + target_label = "job" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_instance"] + action = "replace" + target_label = "instance" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_metrics_path"] + action = "replace" + target_label = "__metrics_path__" + } + + // Choose the pod port + // The discovery generates a target for each declared container port of the pod. + // If the metricsPortName annotation has value, keep only the target where the port name matches the one of the annotation. 
+ rule { + source_labels = ["__meta_kubernetes_pod_container_port_name"] + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_metrics_portName"] + regex = "(.+)" + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_pod_container_port_name"] + action = "keepequal" + target_label = "__tmp_port" + } + + // If the metrics port number annotation has a value, override the target address to use it, regardless whether it is + // one of the declared ports on that Pod. + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_metrics_portNumber", "__meta_kubernetes_pod_ip"] + regex = "(\\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})" + replacement = "[$2]:$1" // IPv6 + target_label = "__address__" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_metrics_portNumber", "__meta_kubernetes_pod_ip"] + regex = "(\\d+);((([0-9]+?)(\\.|$)){4})" // IPv4, takes priority over IPv6 when both exists + replacement = "$2:$1" + target_label = "__address__" + } + + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_metrics_scheme"] + action = "replace" + target_label = "__scheme__" + } + + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_metrics_scrapeInterval"] + action = "replace" + target_label = "__scrape_interval__" + } + } + + discovery.kubernetes "services" { + role = "service" + } + + discovery.relabel "annotation_autodiscovery_services" { + targets = discovery.kubernetes.services.targets + rule { + source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_scrape"] + regex = "true" + action = "keep" + } + rule { + source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_job"] + action = "replace" + target_label = "job" + } + rule { + source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_instance"] + action = "replace" + target_label = "instance" + } 
+ rule { + source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_metrics_path"] + action = "replace" + target_label = "__metrics_path__" + } + + // Choose the service port + rule { + source_labels = ["__meta_kubernetes_service_port_name"] + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_metrics_portName"] + regex = "(.+)" + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_service_port_name"] + action = "keepequal" + target_label = "__tmp_port" + } + + rule { + source_labels = ["__meta_kubernetes_service_port_number"] + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_metrics_portNumber"] + regex = "(.+)" + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_service_port_number"] + action = "keepequal" + target_label = "__tmp_port" + } + + rule { + source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_metrics_scheme"] + action = "replace" + target_label = "__scheme__" + } + + rule { + source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_metrics_scrapeInterval"] + action = "replace" + target_label = "__scrape_interval__" + } + } + + discovery.relabel "annotation_autodiscovery_http" { + targets = concat(discovery.relabel.annotation_autodiscovery_pods.output, discovery.relabel.annotation_autodiscovery_services.output) + rule { + source_labels = ["__scheme__"] + regex = "https" + action = "drop" + } + } + + discovery.relabel "annotation_autodiscovery_https" { + targets = concat(discovery.relabel.annotation_autodiscovery_pods.output, discovery.relabel.annotation_autodiscovery_services.output) + rule { + source_labels = ["__scheme__"] + regex = "https" + action = "keep" + } + } + + prometheus.scrape "annotation_autodiscovery_http" { + targets = discovery.relabel.annotation_autodiscovery_http.output + scrape_interval = "60s" + 
honor_labels = true + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + clustering { + enabled = true + } + + forward_to = argument.metrics_destinations.value + } + + prometheus.scrape "annotation_autodiscovery_https" { + targets = discovery.relabel.annotation_autodiscovery_https.output + scrape_interval = "60s" + honor_labels = true + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + tls_config { + insecure_skip_verify = true + } + clustering { + enabled = true + } + + forward_to = argument.metrics_destinations.value + } + } + diff --git a/charts/feature-annotation-autodiscovery/tests/prometheus_annotation_test.yaml b/charts/feature-annotation-autodiscovery/tests/prometheus_annotation_test.yaml new file mode 100644 index 000000000..56ad0dcdd --- /dev/null +++ b/charts/feature-annotation-autodiscovery/tests/prometheus_annotation_test.yaml @@ -0,0 +1,214 @@ +# yamllint disable rule:document-start rule:line-length rule:trailing-spaces +suite: Test with prometheus.io annotations +templates: + - configmap.yaml +tests: + - it: creates a module with default discovery, scraping, and processing configurations + set: + deployAsConfigMap: true + annotations: + scrape: prometheus.io/scrape + metricsScheme: prometheus.io/scheme + metricsPath: prometheus.io/path + metricsPort: prometheus.io/port + asserts: + - isKind: + of: ConfigMap + - equal: + path: data["module.alloy"] + value: |- + declare "annotation_autodiscovery" { + argument "metrics_destinations" { + comment = "Must be a list of metric destinations where collected metrics should be forwarded to" + } + + discovery.kubernetes "pods" { + role = "pod" + } + + discovery.relabel "annotation_autodiscovery_pods" { + targets = discovery.kubernetes.pods.targets + rule { + source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_scrape"] + regex = "true" + action = "keep" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_job"] + action = 
"replace" + target_label = "job" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_instance"] + action = "replace" + target_label = "instance" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_path"] + action = "replace" + target_label = "__metrics_path__" + } + + // Choose the pod port + // The discovery generates a target for each declared container port of the pod. + // If the metricsPortName annotation has value, keep only the target where the port name matches the one of the annotation. + rule { + source_labels = ["__meta_kubernetes_pod_container_port_name"] + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_metrics_portName"] + regex = "(.+)" + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_pod_container_port_name"] + action = "keepequal" + target_label = "__tmp_port" + } + + // If the metrics port number annotation has a value, override the target address to use it, regardless whether it is + // one of the declared ports on that Pod. 
+ rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_metrics_portNumber", "__meta_kubernetes_pod_ip"] + regex = "(\\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})" + replacement = "[$2]:$1" // IPv6 + target_label = "__address__" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_metrics_portNumber", "__meta_kubernetes_pod_ip"] + regex = "(\\d+);((([0-9]+?)(\\.|$)){4})" // IPv4, takes priority over IPv6 when both exists + replacement = "$2:$1" + target_label = "__address__" + } + + rule { + source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_scheme"] + action = "replace" + target_label = "__scheme__" + } + + rule { + source_labels = ["__meta_kubernetes_pod_annotation_k8s_grafana_com_metrics_scrapeInterval"] + action = "replace" + target_label = "__scrape_interval__" + } + } + + discovery.kubernetes "services" { + role = "service" + } + + discovery.relabel "annotation_autodiscovery_services" { + targets = discovery.kubernetes.services.targets + rule { + source_labels = ["__meta_kubernetes_service_annotation_prometheus_io_scrape"] + regex = "true" + action = "keep" + } + rule { + source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_job"] + action = "replace" + target_label = "job" + } + rule { + source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_instance"] + action = "replace" + target_label = "instance" + } + rule { + source_labels = ["__meta_kubernetes_service_annotation_prometheus_io_path"] + action = "replace" + target_label = "__metrics_path__" + } + + // Choose the service port + rule { + source_labels = ["__meta_kubernetes_service_port_name"] + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_metrics_portName"] + regex = "(.+)" + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_service_port_name"] + action = "keepequal" + target_label = "__tmp_port" + } + + 
rule { + source_labels = ["__meta_kubernetes_service_port_number"] + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_metrics_portNumber"] + regex = "(.+)" + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_service_port_number"] + action = "keepequal" + target_label = "__tmp_port" + } + + rule { + source_labels = ["__meta_kubernetes_service_annotation_prometheus_io_scheme"] + action = "replace" + target_label = "__scheme__" + } + + rule { + source_labels = ["__meta_kubernetes_service_annotation_k8s_grafana_com_metrics_scrapeInterval"] + action = "replace" + target_label = "__scrape_interval__" + } + } + + discovery.relabel "annotation_autodiscovery_http" { + targets = concat(discovery.relabel.annotation_autodiscovery_pods.output, discovery.relabel.annotation_autodiscovery_services.output) + rule { + source_labels = ["__scheme__"] + regex = "https" + action = "drop" + } + } + + discovery.relabel "annotation_autodiscovery_https" { + targets = concat(discovery.relabel.annotation_autodiscovery_pods.output, discovery.relabel.annotation_autodiscovery_services.output) + rule { + source_labels = ["__scheme__"] + regex = "https" + action = "keep" + } + } + + prometheus.scrape "annotation_autodiscovery_http" { + targets = discovery.relabel.annotation_autodiscovery_http.output + scrape_interval = "60s" + honor_labels = true + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + clustering { + enabled = true + } + + forward_to = argument.metrics_destinations.value + } + + prometheus.scrape "annotation_autodiscovery_https" { + targets = discovery.relabel.annotation_autodiscovery_https.output + scrape_interval = "60s" + honor_labels = true + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + tls_config { + insecure_skip_verify = true + } + clustering { + enabled = true + } + + forward_to = argument.metrics_destinations.value + } + } + 
diff --git a/charts/feature-annotation-autodiscovery/values.schema.json b/charts/feature-annotation-autodiscovery/values.schema.json new file mode 100644 index 000000000..096f562db --- /dev/null +++ b/charts/feature-annotation-autodiscovery/values.schema.json @@ -0,0 +1,89 @@ +{ + "$schema": "http://json-schema.org/schema#", + "type": "object", + "properties": { + "annotations": { + "type": "object", + "properties": { + "instance": { + "type": "string" + }, + "job": { + "type": "string" + }, + "metricsPath": { + "type": "string" + }, + "metricsPortName": { + "type": "string" + }, + "metricsPortNumber": { + "type": "string" + }, + "metricsScheme": { + "type": "string" + }, + "metricsScrapeInterval": { + "type": "string" + }, + "scrape": { + "type": "string" + } + } + }, + "bearerToken": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "token": { + "type": "string" + } + } + }, + "deployAsConfigMap": { + "type": "boolean" + }, + "extraDiscoveryRules": { + "type": "string" + }, + "extraMetricProcessingRules": { + "type": "string" + }, + "fullnameOverride": { + "type": "string" + }, + "global": { + "type": "object", + "properties": { + "maxCacheSize": { + "type": "integer" + }, + "scrapeInterval": { + "type": "string" + } + } + }, + "maxCacheSize": { + "type": "null" + }, + "metricsTuning": { + "type": "object", + "properties": { + "excludeMetrics": { + "type": "array" + }, + "includeMetrics": { + "type": "array" + } + } + }, + "nameOverride": { + "type": "string" + }, + "scrapeInterval": { + "type": "string" + } + } +} diff --git a/charts/feature-annotation-autodiscovery/values.yaml b/charts/feature-annotation-autodiscovery/values.yaml new file mode 100644 index 000000000..9313e7e62 --- /dev/null +++ b/charts/feature-annotation-autodiscovery/values.yaml @@ -0,0 +1,94 @@ +--- +# -- Name override +# @section -- General settings +nameOverride: "" + +# -- Full name override +# @section -- General settings +fullnameOverride: "" + +global: + 
# -- How frequently to scrape metrics.
+  # @section -- Global Settings
+  scrapeInterval: 60s
+
+  # -- Sets the max_cache_size for every prometheus.relabel component. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#arguments))
+  # This should be at least 2x-5x your largest scrape target or samples appended rate.
+  # @section -- Global Settings
+  maxCacheSize: 100000
+
+# Annotations that are used to discover and configure metric scraping targets. Add these annotations
+# to your services or pods to control how autodiscovery will find and scrape metrics from your service or pod.
+annotations:
+  # -- Annotation for enabling scraping for this service or pod. Value should be either "true" or "false"
+  # @section -- Annotations
+  scrape: "k8s.grafana.com/scrape"
+  # -- Annotation for overriding the job label
+  # @section -- Annotations
+  job: "k8s.grafana.com/job"
+  # -- Annotation for overriding the instance label
+  # @section -- Annotations
+  instance: "k8s.grafana.com/instance"
+  # -- Annotation for setting or overriding the metrics path. If not set, it defaults to /metrics
+  # @section -- Annotations
+  metricsPath: "k8s.grafana.com/metrics.path"
+  # -- Annotation for setting the metrics port by name.
+  # @section -- Annotations
+  metricsPortName: "k8s.grafana.com/metrics.portName"
+  # -- Annotation for setting the metrics port by number.
+  # @section -- Annotations
+  metricsPortNumber: "k8s.grafana.com/metrics.portNumber"
+  # -- Annotation for setting the metrics scheme, default: http.
+  # @section -- Annotations
+  metricsScheme: "k8s.grafana.com/metrics.scheme"
+  # -- Annotation for overriding the scrape interval for this service or pod. Value should be a duration like "15s, 1m".
+  # Overrides metrics.autoDiscover.scrapeInterval
+  # @section -- Annotations
+  metricsScrapeInterval: "k8s.grafana.com/metrics.scrapeInterval"
+
+# -- Rule blocks to be added to the discovery.relabel components for discovered pods and services.
+# These relabeling rules are applied pre-scrape against the targets from service discovery.
+# They are applied after the built-in annotation-based relabeling rules.
+# Before the scrape, any remaining target labels that start with `__` (i.e. `__meta_kubernetes*`) are dropped.
+# ([docs](https://grafana.com/docs/alloy/latest/reference/components/discovery.relabel/#rule-block))
+# @section -- Discovery Settings
+extraDiscoveryRules: ""
+
+# -- How frequently to scrape metrics from discovered pods and services. Only used if the target does not set the scrape interval annotation.
+# Overrides global.scrapeInterval
+# @default -- 60s
+# @section -- Scrape Settings
+scrapeInterval: ""
+
+# Adjustments to the scraped metrics to filter the amount of metrics sent to storage.
+# @section -- Metric Processing Settings
+metricsTuning:
+  # -- Metrics to keep. Can use regular expressions.
+  # @section -- Metric Processing Settings
+  includeMetrics: []
+  # -- Metrics to drop. Can use regular expressions.
+  # @section -- Metric Processing Settings
+  excludeMetrics: []
+
+# -- Rule blocks to be added to the prometheus.relabel component for discovered targets.
+# These relabeling rules are applied post-scrape against the metrics returned from the scraped target, no `__meta*` labels are present.
+# ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#rule-block))
+# @section -- Metric Processing Settings
+extraMetricProcessingRules: ""
+
+# -- Sets the max_cache_size for the annotation autodiscovery prometheus.relabel component.
+# This should be at least 2x-5x your largest scrape target or samples appended rate.
+# ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#arguments))
+# Overrides global.maxCacheSize
+# @raw
+# @section -- Metric Processing Settings
+maxCacheSize:
+
+# -- Sets bearer_token_file line in the prometheus.scrape annotation_autodiscovery.
+# @section -- Scrape Settings +bearerToken: + enabled: true + token: /var/run/secrets/kubernetes.io/serviceaccount/token + +# @ignore +deployAsConfigMap: false diff --git a/charts/feature-application-observability/.helmignore b/charts/feature-application-observability/.helmignore new file mode 100644 index 000000000..2b29eaf56 --- /dev/null +++ b/charts/feature-application-observability/.helmignore @@ -0,0 +1,6 @@ +docs +schema-mods +tests +Makefile +README.md +README.md.gotmpl diff --git a/charts/feature-application-observability/Chart.lock b/charts/feature-application-observability/Chart.lock new file mode 100644 index 000000000..e39a95f5c --- /dev/null +++ b/charts/feature-application-observability/Chart.lock @@ -0,0 +1,3 @@ +dependencies: [] +digest: sha256:643d5437104296e21d906ecb15b2c96ad278f20cfc4af53b12bb6069bd853726 +generated: "2024-09-25T13:46:10.334192-05:00" diff --git a/charts/feature-application-observability/Chart.yaml b/charts/feature-application-observability/Chart.yaml new file mode 100644 index 000000000..2a61ef764 --- /dev/null +++ b/charts/feature-application-observability/Chart.yaml @@ -0,0 +1,11 @@ +--- +apiVersion: v2 +name: k8s-monitoring-feature-application-observability +description: Gathers application data +icon: https://raw.githubusercontent.com/grafana/grafana/main/public/img/grafana_icon.svg +version: 1.0.0 +appVersion: 1.0.0 +maintainers: + - email: pete.wall@grafana.com + name: petewall +dependencies: [] diff --git a/charts/feature-application-observability/Makefile b/charts/feature-application-observability/Makefile new file mode 100644 index 000000000..605b55098 --- /dev/null +++ b/charts/feature-application-observability/Makefile @@ -0,0 +1,34 @@ +HAS_HELM_DOCS := $(shell command -v helm-docs;) +HAS_HELM_UNITTEST := $(shell helm plugin list | grep unittest 2> /dev/null) + +.SECONDEXPANSION: +README.md: values.yaml Chart.yaml $$(wildcard README.md.gotmpl) +ifdef HAS_HELM_DOCS + helm-docs +else + docker run --rm --volume 
"$(shell pwd):/helm-docs" -u $(shell id -u) jnorwood/helm-docs:latest +endif + +Chart.lock: Chart.yaml + helm dependency update . + @touch Chart.lock # Ensure the timestamp is updated + +values.schema.json: values.yaml $$(wildcard schema-mods/*) + ../../scripts/schema-gen.sh . + +.PHONY: clean +clean: + rm -f README.md values.schema.json + +.PHONY: build +build: README.md Chart.lock values.schema.json + +.PHONY: test +test: build + helm lint . + ct lint --lint-conf ../../.configs/lintconf.yaml --check-version-increment=false --charts . +ifdef HAS_HELM_UNITTEST + helm unittest . +else + docker run --rm --volume $(shell pwd):/apps helmunittest/helm-unittest . +endif diff --git a/charts/feature-application-observability/README.md b/charts/feature-application-observability/README.md new file mode 100644 index 000000000..34f945e0c --- /dev/null +++ b/charts/feature-application-observability/README.md @@ -0,0 +1,79 @@ +# k8s-monitoring-feature-application-observability + +![Version: 1.0.0](https://img.shields.io/badge/Version-1.0.0-informational?style=flat-square) ![AppVersion: 1.0.0](https://img.shields.io/badge/AppVersion-1.0.0-informational?style=flat-square) + +Gathers application data + +## Maintainers + +| Name | Email | Url | +| ---- | ------ | --- | +| petewall | | | + +## Values + +### General settings + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| fullnameOverride | string | `""` | Full name override | +| nameOverride | string | `""` | Name override | + +### Processors: Batch + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| processors.batch.maxSize | int | `0` | The upper limit of the amount of data contained in a single batch, in bytes. When set to 0, batches can be any size. 
| +| processors.batch.size | int | `16384` | What batch size to use, in bytes | +| processors.batch.timeout | string | `"2s"` | How long before sending (Processors) | + +### Processors: Grafana Cloud Host Info + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| processors.grafanaCloudMetrics.enabled | bool | `true` | Generate host info metrics from telemetry data, used in Application Observability in Grafana Cloud. | + +### Processors: K8s Attributes + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| processors.k8sattributes.annotations | list | `[]` | Kubernetes annotations to extract and add to the attributes of the received telemetry data. | +| processors.k8sattributes.labels | list | `[]` | Kubernetes labels to extract and add to the attributes of the received telemetry data. | +| processors.k8sattributes.metadata | list | `["k8s.namespace.name","k8s.pod.name","k8s.deployment.name","k8s.statefulset.name","k8s.daemonset.name","k8s.cronjob.name","k8s.job.name","k8s.node.name","k8s.pod.uid","k8s.pod.start_time"]` | Kubernetes metadata to extract and add to the attributes of the received telemetry data. | + +### Processors: Memory Limiter + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| processors.memoryLimiter.checkInterval | string | `"1s"` | How often to check memory usage. | +| processors.memoryLimiter.enabled | bool | `false` | Use a memory limiter. | +| processors.memoryLimiter.limit | string | `"0MiB"` | Maximum amount of memory targeted to be allocated by the process heap. | + +### Other Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| logs.enabled | bool | `true` | | +| logs.filters | object | `{"log_record":[]}` | Apply a filter to logs received via receivers. 
([docs](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.filter/)) | +| logs.transforms | object | `{"labels":["cluster","namespace","job","pod"],"log":[],"resource":[]}` | Apply a transformation to logs received via the OTLP or OTLP HTTP receivers. ([docs](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.transform/)) | +| logs.transforms.labels | list | `["cluster","namespace","job","pod"]` | The list of labels to set in the log stream. | +| logs.transforms.log | list | `[]` | Log transformation rules. | +| logs.transforms.resource | list | `[]` | Resource transformation rules. | +| metrics.enabled | bool | `true` | | +| metrics.filters | object | `{"datapoint":[],"metric":[]}` | Apply a filter to metrics received via the OTLP or OTLP HTTP receivers. ([docs](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.filter/)) | +| metrics.transforms | object | `{"datapoint":[],"metric":[],"resource":[]}` | Apply a transformation to metrics received via the OTLP or OTLP HTTP receivers. ([docs](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.transform/)) | +| receivers.grpc.enabled | bool | `false` | | +| receivers.grpc.include_debug_metrics | bool | `false` | | +| receivers.grpc.port | int | `4317` | | +| receivers.http.enabled | bool | `false` | | +| receivers.http.include_debug_metrics | bool | `false` | | +| receivers.http.port | int | `4318` | | +| receivers.zipkin.enabled | bool | `false` | | +| receivers.zipkin.include_debug_metrics | bool | `false` | | +| receivers.zipkin.port | int | `9411` | | +| traces.enabled | bool | `true` | | +| traces.filters | object | `{"span":[],"spanevent":[]}` | Apply a filter to traces received via the OTLP or OTLP HTTP receivers. 
([docs](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.filter/)) | +| traces.transforms | object | `{"resource":[],"span":[],"spanevent":[]}` | Apply a transformation to traces received via the OTLP or OTLP HTTP receivers. ([docs](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.transform/)) | + +---------------------------------------------- +Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2) diff --git a/charts/feature-application-observability/templates/_connector_host_info.tpl b/charts/feature-application-observability/templates/_connector_host_info.tpl new file mode 100644 index 000000000..0d9046391 --- /dev/null +++ b/charts/feature-application-observability/templates/_connector_host_info.tpl @@ -0,0 +1,14 @@ +{{/* Inputs: Values (values) metricsOutput, name */}} +{{/* https://grafana.com/docs/alloy/latest/reference/components/otelcol/otelcol.connector.host_info/ */}} +{{- define "feature.applicationObservability.connector.host_info.alloy.target" }}otelcol.connector.host_info.{{ .name | default "default" }}.input{{- end }} +{{- define "feature.applicationObservability.connector.host_info.alloy" }} +otelcol.connector.host_info "{{ .name | default "default" }}" { + host_identifiers = [ "k8s.node.name" ] + + output { +{{- if and .metricsOutput .Values.metrics.enabled }} + metrics = {{ .metricsOutput }} +{{- end }} + } +} +{{- end }} \ No newline at end of file diff --git a/charts/feature-application-observability/templates/_helpers.tpl b/charts/feature-application-observability/templates/_helpers.tpl new file mode 100644 index 000000000..edef617ad --- /dev/null +++ b/charts/feature-application-observability/templates/_helpers.tpl @@ -0,0 +1,30 @@ +{{/* +Create a default fully qualified name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 
+If release name contains chart name it will be used as a full name. +*/}} +{{- define "feature.applicationObservability.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" | lower }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride | lower }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" | lower }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" | lower }} +{{- end }} +{{- end }} +{{- end }} + +{{- define "english_list" }} +{{- if eq (len .) 0 }} +{{- else if eq (len .) 1 }} +{{- index . 0 }} +{{- else if eq (len .) 2 }} +{{- index . 0 }} and {{ index . 1 }} +{{- else }} +{{- $last := index . (sub (len .) 1) }} +{{- $rest := slice . 0 (sub (len .) 1) }} +{{- join ", " $rest }}, and {{ $last }} +{{- end }} +{{- end }} diff --git a/charts/feature-application-observability/templates/_module.alloy.tpl b/charts/feature-application-observability/templates/_module.alloy.tpl new file mode 100644 index 000000000..6e21ff000 --- /dev/null +++ b/charts/feature-application-observability/templates/_module.alloy.tpl @@ -0,0 +1,84 @@ +{{- define "feature.applicationObservability.module" }} +{{- $metricsNext := "" }} +{{- $logsNext := "" }} +{{- $resourceDetection := include "feature.applicationObservability.processor.resourcedetection.alloy.target" dict }} +{{- $k8sAttributes := include "feature.applicationObservability.processor.k8sattributes.alloy.target" dict }} +{{- $grafanaCloudMetrics := include "feature.applicationObservability.connector.host_info.alloy.target" dict }} +{{- $transform := include "feature.applicationObservability.processor.transform.alloy.target" dict }} +{{- $filter := include "feature.applicationObservability.processor.filter.alloy.target" dict }} +{{- $batch := include "feature.applicationObservability.processor.batch.alloy.target" dict }} +{{- $memoryLimiter := include 
"feature.applicationObservability.processor.memory_limiter.alloy.target" dict }} +declare "application_observability" { + argument "metrics_destinations" { + comment = "Must be a list of metrics destinations where collected metrics should be forwarded to" + } + + argument "logs_destinations" { + comment = "Must be a list of log destinations where collected logs should be forwarded to" + } + + argument "traces_destinations" { + comment = "Must be a list of trace destinations where collected trace should be forwarded to" + } + + // Receivers --> Resource Detection Processor + {{- $next := printf "[%s]" $resourceDetection }} + {{- include "feature.applicationObservability.receiver.otlp.alloy" (dict "Values" $.Values "metricsOutput" $next "logsOutput" $next "tracesOutput" $next ) | indent 2 }} + {{- include "feature.applicationObservability.receiver.zipkin.alloy" (dict "Values" $.Values "tracesOutput" $next ) | indent 2 }} + + // Resource Detection Processor --> K8s Attribute Processor + {{- $next = printf "[%s]" $k8sAttributes }} + {{- include "feature.applicationObservability.processor.resourcedetection.alloy" (dict "Values" $.Values "metricsOutput" $next "logsOutput" $next "tracesOutput" $next ) | indent 2 }} + + // K8s Attribute Processor --> Transform Processor + {{- $tracesNext := list $transform }} + {{- $next = printf "[%s]" $transform }} +{{- if .Values.processors.grafanaCloudMetrics.enabled }} + // Resource Detection Processor Traces --> Host Info Connector + {{- $tracesNext = append $tracesNext $grafanaCloudMetrics }} +{{- end -}} + {{- $tracesNext = printf "[%s]" ($tracesNext | join ", ")}} + {{- include "feature.applicationObservability.processor.k8sattributes.alloy" (dict "Values" $.Values "metricsOutput" $next "logsOutput" $next "tracesOutput" $tracesNext ) | indent 2 }} + +{{- if .Values.processors.grafanaCloudMetrics.enabled }} + // Host Info Connector --> Batch Processor + {{- $next = printf "[%s]" $batch }} + {{- include 
"feature.applicationObservability.connector.host_info.alloy" (dict "Values" $.Values "metricsOutput" $next ) | indent 2 }} +{{- end }} + +{{ if eq (include "feature.applicationObservability.processor.filter.enabled" .) "true" }} + // Transform Processor --> Filter Processor + {{- $next = printf "[%s]" $filter }} +{{- else }} + // Transform Processor --> Batch Processor + {{- $next = printf "[%s]" $batch }} +{{- end }} + {{- include "feature.applicationObservability.processor.transform.alloy" (dict "Values" $.Values "metricsOutput" $next "logsOutput" $next "tracesOutput" $next ) | indent 2 }} +{{ if eq (include "feature.applicationObservability.processor.filter.enabled" .) "true" }} + // Filter Processor --> Batch Processor + {{- $next = printf "[%s]" $batch }} + {{- include "feature.applicationObservability.processor.filter.alloy" (dict "Values" $.Values "metricsOutput" $next "logsOutput" $next "tracesOutput" $next ) | indent 2 }} +{{- end }} + +{{- if .Values.processors.memoryLimiter.enabled }} + // Batch Processor --> Memory Limiter + {{- $metricsNext = printf "[%s]" $memoryLimiter }} + {{- $logsNext = printf "[%s]" $memoryLimiter }} + {{- $tracesNext = printf "[%s]" $memoryLimiter }} +{{- else }} + // Batch Processor --> Destinations + {{- $metricsNext = "argument.metrics_destinations.value" }} + {{- $logsNext = "argument.logs_destinations.value" }} + {{- $tracesNext = "argument.traces_destinations.value" }} +{{- end }} + {{- include "feature.applicationObservability.processor.batch.alloy" (dict "Values" $.Values "metricsOutput" $metricsNext "logsOutput" $logsNext "tracesOutput" $tracesNext ) | indent 2 }} + +{{- if .Values.processors.memoryLimiter.enabled }} + // Memory Limiter --> Destinations + {{- $metricsNext = "argument.metrics_destinations.value" }} + {{- $logsNext = "argument.logs_destinations.value" }} + {{- $tracesNext = "argument.traces_destinations.value" }} + {{- include "feature.applicationObservability.processor.memory_limiter.alloy" (dict 
"Values" $.Values "metricsOutput" $metricsNext "logsOutput" $logsNext "tracesOutput" $tracesNext ) | indent 2 }} +{{- end }} +} +{{- end }} diff --git a/charts/feature-application-observability/templates/_notes.tpl b/charts/feature-application-observability/templates/_notes.tpl new file mode 100644 index 000000000..6140c4cba --- /dev/null +++ b/charts/feature-application-observability/templates/_notes.tpl @@ -0,0 +1,23 @@ +{{- define "feature.applicationObservability.notes.deployments" }}{{- end }} + +{{- define "feature.applicationObservability.notes.task" }} +{{- $receivers := list }} +{{- if .Values.receivers.grpc.enabled }}{{- $receivers = append $receivers "OTLP gRPC" }}{{ end }} +{{- if .Values.receivers.http.enabled }}{{- $receivers = append $receivers "OTLP HTTP" }}{{ end }} +{{- if .Values.receivers.zipkin.enabled }}{{- $receivers = append $receivers "Zipkin" }}{{ end }} +{{- $receiverWord := len $receivers | plural "receiver" "receivers" }} +Gather application data via {{ include "english_list" $receivers }} {{ $receiverWord }} +{{- end }} + +{{- define "feature.applicationObservability.notes.actions" }} +Configure your applications to send telemetry data to: +{{- if .Values.receivers.grpc.enabled }} +* http://{{ .Collector.ServiceName }}.{{ .Collector.Namespace }}.svc.cluster.local:{{ .Values.receivers.grpc.port }} (OTLP gRPC) +{{ end }} +{{- if .Values.receivers.http.enabled }} +* http://{{ .Collector.ServiceName }}.{{ .Collector.Namespace }}.svc.cluster.local:{{ .Values.receivers.http.port }} (OTLP HTTP) +{{ end }} +{{- if .Values.receivers.zipkin.enabled }} +* http://{{ .Collector.ServiceName }}.{{ .Collector.Namespace }}.svc.cluster.local:{{ .Values.receivers.zipkin.port }} (Zipkin) +{{ end }} +{{- end }} diff --git a/charts/feature-application-observability/templates/_processor_batch.tpl b/charts/feature-application-observability/templates/_processor_batch.tpl new file mode 100644 index 000000000..c9815d368 --- /dev/null +++ 
b/charts/feature-application-observability/templates/_processor_batch.tpl @@ -0,0 +1,17 @@ +{{/* Inputs: Values (values) metricsOutput, logsOutput, tracesOutput, name */}} +{{- define "feature.applicationObservability.processor.batch.alloy.target" }}otelcol.processor.batch.{{ .name | default "default" }}.input{{ end }} +{{- define "feature.applicationObservability.processor.batch.alloy" }} +otelcol.processor.batch "{{ .name | default "default" }}" { + output { +{{- if and .metricsOutput .Values.metrics.enabled }} + metrics = {{ .metricsOutput }} +{{- end }} +{{- if and .logsOutput .Values.logs.enabled }} + logs = {{ .logsOutput }} +{{- end }} +{{- if and .tracesOutput .Values.traces.enabled }} + traces = {{ .tracesOutput }} +{{- end }} + } +} +{{- end }} \ No newline at end of file diff --git a/charts/feature-application-observability/templates/_processor_filter.tpl b/charts/feature-application-observability/templates/_processor_filter.tpl new file mode 100644 index 000000000..95e649469 --- /dev/null +++ b/charts/feature-application-observability/templates/_processor_filter.tpl @@ -0,0 +1,70 @@ +{{/* Inputs: Values (values) metricsOutput, logsOutput, tracesOutput, name */}} +{{/* https://grafana.com/docs/alloy/latest/reference/components/otelcol/otelcol.processor.filter/ */}} +{{- define "feature.applicationObservability.processor.filter.enabled" }} +{{- if or (and .Values.metrics.enabled (or .Values.metrics.filters.metric .Values.metrics.filters.datapoint)) (and .Values.logs.enabled .Values.logs.filters.log_record) (and .Values.traces.enabled (or .Values.traces.filters.span .Values.traces.filters.spanevent)) }} +true +{{- else }} +false +{{- end }} +{{- end }} +{{- define "feature.applicationObservability.processor.filter.alloy.target" }}otelcol.processor.filter.{{ .name | default "default" }}.input{{ end }} +{{- define "feature.applicationObservability.processor.filter.alloy" }} +otelcol.processor.filter "{{ .name | default "default" }}" { +{{- if and 
.Values.metrics.enabled (or .Values.metrics.filters.metric .Values.metrics.filters.datapoint) }} + metrics { +{{- if .Values.metrics.filters.metric }} + metric = [ +{{- range $filter := .Values.metrics.filters.metric }} +{{ $filter | quote | indent 6 }}, +{{- end }} + ] +{{- end }} +{{- if .Values.metrics.filters.datapoint }} + datapoint = [ +{{- range $filter := .Values.metrics.filters.datapoint }} +{{ $filter | quote | indent 6 }}, +{{- end }} + ] +{{- end }} + } +{{- end }} +{{- if and .Values.logs.enabled .Values.logs.filters.log_record }} + logs { + log_record = [ +{{- range $filter := .Values.logs.filters.log_record }} +{{ $filter | quote | indent 6 }}, +{{- end }} + ] + } +{{- end }} +{{- if and .Values.traces.enabled (or .Values.traces.filters.span .Values.traces.filters.spanevent) }} + traces { +{{- if .Values.traces.filters.span }} + span = [ +{{- range $filter := .Values.traces.filters.span }} +{{ $filter | quote | indent 6 }}, +{{- end }} + ] +{{- end }} +{{- if .Values.traces.filters.spanevent }} + spanevent = [ +{{- range $filter := .Values.traces.filters.spanevent }} +{{ $filter | quote | indent 6 }}, +{{- end }} + ] +{{- end }} + } +{{- end }} + output { +{{- if and .metricsOutput .Values.metrics.enabled }} + metrics = {{ .metricsOutput }} +{{- end }} +{{- if and .logsOutput .Values.logs.enabled }} + logs = {{ .logsOutput }} +{{- end }} +{{- if and .tracesOutput .Values.traces.enabled }} + traces = {{ .tracesOutput }} +{{- end }} + } +} +{{- end }} \ No newline at end of file diff --git a/charts/feature-application-observability/templates/_processor_k8sattributes.tpl b/charts/feature-application-observability/templates/_processor_k8sattributes.tpl new file mode 100644 index 000000000..03695d2b6 --- /dev/null +++ b/charts/feature-application-observability/templates/_processor_k8sattributes.tpl @@ -0,0 +1,43 @@ +{{/* Inputs: Values (values) metricsOutput, logsOutput, tracesOutput, name */}} +{{/* 
https://grafana.com/docs/alloy/latest/reference/components/otelcol/otelcol.processor.k8sattributes/ */}} +{{- define "feature.applicationObservability.processor.k8sattributes.alloy.target" }}otelcol.processor.k8sattributes.{{ .name | default "default" }}.input{{ end }} +{{- define "feature.applicationObservability.processor.k8sattributes.alloy" }} +otelcol.processor.k8sattributes "{{ .name | default "default" }}" { + extract { +{{- if .Values.processors.k8sattributes.metadata }} + metadata = {{ .Values.processors.k8sattributes.metadata | toJson }} +{{- end }} +{{- range .Values.processors.k8sattributes.labels }} + label { + {{- range $k, $v := . }} + {{ $k }} = {{ $v | quote }} + {{- end }} + } +{{- end }} +{{- range .Values.processors.k8sattributes.annotations }} + annotation { + {{- range $k, $v := . }} + {{ $k }} = {{ $v | quote }} + {{- end }} + } +{{- end }} + } + pod_association { + source { + from = "connection" + } + } + + output { +{{- if and .metricsOutput .Values.metrics.enabled }} + metrics = {{ .metricsOutput }} +{{- end }} +{{- if and .logsOutput .Values.logs.enabled }} + logs = {{ .logsOutput }} +{{- end }} +{{- if and .tracesOutput .Values.traces.enabled }} + traces = {{ .tracesOutput }} +{{- end }} + } +} +{{- end }} \ No newline at end of file diff --git a/charts/feature-application-observability/templates/_processor_memory_limiter.tpl b/charts/feature-application-observability/templates/_processor_memory_limiter.tpl new file mode 100644 index 000000000..51798624a --- /dev/null +++ b/charts/feature-application-observability/templates/_processor_memory_limiter.tpl @@ -0,0 +1,20 @@ +{{/* Inputs: Values (values) metricsOutput, logsOutput, tracesOutput, name */}} +{{- define "feature.applicationObservability.processor.memory_limiter.alloy.target" }}otelcol.processor.memory_limiter.{{ .name | default "default" }}.input{{ end }} +{{- define "feature.applicationObservability.processor.memory_limiter.alloy" }} +otelcol.processor.memory_limiter "{{ .name 
| default "default" }}" { + check_interval = {{ .Values.processors.memoryLimiter.checkInterval | quote }} + limit = {{ .Values.processors.memoryLimiter.limit | quote }} + + output { +{{- if and .metricsOutput .Values.metrics.enabled }} + metrics = {{ .metricsOutput }} +{{- end }} +{{- if and .logsOutput .Values.logs.enabled }} + logs = {{ .logsOutput }} +{{- end }} +{{- if and .tracesOutput .Values.traces.enabled }} + traces = {{ .tracesOutput }} +{{- end }} + } +} +{{- end }} \ No newline at end of file diff --git a/charts/feature-application-observability/templates/_processor_resourcedetection.tpl b/charts/feature-application-observability/templates/_processor_resourcedetection.tpl new file mode 100644 index 000000000..33063e91c --- /dev/null +++ b/charts/feature-application-observability/templates/_processor_resourcedetection.tpl @@ -0,0 +1,23 @@ +{{/* Inputs: Values (values) metricsOutput, logsOutput, tracesOutput, name */}} +{{/* https://grafana.com/docs/alloy/latest/reference/components/otelcol/otelcol.processor.resourcedetection/ */}} +{{- define "feature.applicationObservability.processor.resourcedetection.alloy.target" }}otelcol.processor.resourcedetection.{{ .name | default "default" }}.input{{ end }} +{{- define "feature.applicationObservability.processor.resourcedetection.alloy" }} +otelcol.processor.resourcedetection "{{ .name | default "default" }}" { + detectors = ["env", "system"] + system { + hostname_sources = ["os"] + } + + output { +{{- if and .metricsOutput .Values.metrics.enabled }} + metrics = {{ .metricsOutput }} +{{- end }} +{{- if and .logsOutput .Values.logs.enabled }} + logs = {{ .logsOutput }} +{{- end }} +{{- if and .tracesOutput .Values.traces.enabled }} + traces = {{ .tracesOutput }} +{{- end }} + } +} +{{- end }} \ No newline at end of file diff --git a/charts/feature-application-observability/templates/_processor_transform.tpl b/charts/feature-application-observability/templates/_processor_transform.tpl new file mode 100644 index 
000000000..e53648c02 --- /dev/null +++ b/charts/feature-application-observability/templates/_processor_transform.tpl @@ -0,0 +1,110 @@ +{{/* Inputs: Values (values) metricsOutput, logsOutput, tracesOutput, name */}} +{{/* https://grafana.com/docs/alloy/latest/reference/components/otelcol/otelcol.processor.transform/ */}} +{{- define "feature.applicationObservability.processor.transform.alloy.target" }}otelcol.processor.transform.{{ .name | default "default" }}.input{{ end }} +{{- define "feature.applicationObservability.processor.transform.alloy" }} +otelcol.processor.transform "{{ .name | default "default" }}" { + error_mode = "ignore" + +{{- if .Values.metrics.enabled }} +{{- if .Values.metrics.transforms.resource }} + metric_statements { + context = "resource" + statements = [ +{{- range $transform := .Values.metrics.transforms.resource }} +{{ $transform | quote | indent 6 }}, +{{- end }} + ] + } +{{- end }} +{{- if .Values.metrics.transforms.metric }} + metric_statements { + context = "metric" + statements = [ +{{- range $transform := .Values.metrics.transforms.metric }} +{{ $transform | quote | indent 6 }}, +{{- end }} + ] + } +{{- end }} +{{- if .Values.metrics.transforms.datapoint }} + metric_statements { + context = "datapoint" + statements = [ +{{- range $transform := .Values.metrics.transforms.datapoint }} +{{ $transform | quote | indent 6 }}, +{{- end }} + ] + } +{{- end }} +{{- end }} +{{- if .Values.logs.enabled }} + log_statements { + context = "resource" + statements = [ +{{- if .Values.logs.transforms.resource }} +{{- range $transform := .Values.logs.transforms.resource }} +{{ $transform | quote | indent 6 }}, +{{- end }} +{{- end }} + "set(attributes[\"pod\"], attributes[\"k8s.pod.name\"])", + "set(attributes[\"namespace\"], attributes[\"k8s.namespace.name\"])", + "set(attributes[\"loki.resource.labels\"], \"{{ .Values.logs.transforms.labels | join ", " }}\")", + ] + } +{{- if .Values.logs.transforms.log }} + log_statements { + context = "log" + 
statements = [ +{{- range $transform := .Values.logs.transforms.log }} +{{ $transform | quote | indent 6 }}, +{{- end }} + ] + } +{{- end }} +{{- end }} +{{- if .Values.traces.enabled }} +{{- if .Values.traces.transforms.resource }} + trace_statements { + context = "resource" + statements = [ +{{- range $transform := .Values.traces.transforms.resource }} +{{ $transform | quote | indent 6 }}, +{{- end }} + ] + } +{{- end }} +{{- if .Values.traces.transforms.span }} + trace_statements { + context = "span" + statements = [ +{{- range $transform := .Values.traces.transforms.span }} +{{ $transform | quote | indent 6 }}, +{{- end }} + ] + } +{{- end }} +{{- if .Values.traces.transforms.spanevent }} + trace_statements { + context = "spanevent" + statements = [ +{{- range $transform := .Values.traces.transforms.spanevent }} +{{ $transform | quote | indent 6 }}, +{{- end }} + ] + } +{{- end }} +{{- end }} + + output { +{{- if and .metricsOutput .Values.metrics.enabled }} + metrics = {{ .metricsOutput }} +{{- end }} +{{- if and .logsOutput .Values.logs.enabled }} + logs = {{ .logsOutput }} +{{- end }} +{{- if and .tracesOutput .Values.traces.enabled }} + traces = {{ .tracesOutput }} +{{- end }} + } +} +{{- end }} \ No newline at end of file diff --git a/charts/feature-application-observability/templates/_receiver_otlp.tpl b/charts/feature-application-observability/templates/_receiver_otlp.tpl new file mode 100644 index 000000000..70ef8d3bd --- /dev/null +++ b/charts/feature-application-observability/templates/_receiver_otlp.tpl @@ -0,0 +1,31 @@ +{{/* Inputs: Values (values) metricsOutput, logsOutput, tracesOutput */}} +{{- define "feature.applicationObservability.receiver.otlp.alloy" }} +{{- if or .Values.receivers.grpc.enabled .Values.receivers.http.enabled }} +otelcol.receiver.otlp "receiver" { +{{- if .Values.receivers.grpc.enabled }} + grpc { + endpoint = "0.0.0.0:{{ .Values.receivers.grpc.port | int }}" + } +{{- end }} +{{- if .Values.receivers.http.enabled }} + http { 
+ endpoint = "0.0.0.0:{{ .Values.receivers.http.port | int }}" + } +{{- end }} + debug_metrics { + disable_high_cardinality_metrics = {{ not (or .Values.receivers.grpc.include_debug_metrics .Values.receivers.http.include_debug_metrics) }} + } + output { +{{- if and .metricsOutput .Values.metrics.enabled }} + metrics = {{ .metricsOutput }} +{{- end }} +{{- if and .logsOutput .Values.logs.enabled }} + logs = {{ .logsOutput }} +{{- end }} +{{- if and .tracesOutput .Values.traces.enabled }} + traces = {{ .tracesOutput }} +{{- end }} + } +} +{{- end }} +{{- end }} diff --git a/charts/feature-application-observability/templates/_receiver_zipkin.tpl b/charts/feature-application-observability/templates/_receiver_zipkin.tpl new file mode 100644 index 000000000..a0083cf1c --- /dev/null +++ b/charts/feature-application-observability/templates/_receiver_zipkin.tpl @@ -0,0 +1,16 @@ +{{/* Inputs: Values (values) tracesOutput */}} +{{- define "feature.applicationObservability.receiver.zipkin.alloy" }} +{{- if .Values.receivers.zipkin.enabled }} +otelcol.receiver.zipkin "receiver" { + endpoint = "0.0.0.0:{{ .Values.receivers.zipkin.port | int }}" + debug_metrics { + disable_high_cardinality_metrics = {{ not .Values.receivers.zipkin.include_debug_metrics }} + } + output { +{{- if and .tracesOutput .Values.traces.enabled }} + traces = {{ .tracesOutput }} +{{- end }} + } +} +{{- end }} +{{- end }} diff --git a/charts/feature-application-observability/templates/_validation.tpl b/charts/feature-application-observability/templates/_validation.tpl new file mode 100644 index 000000000..045439e3f --- /dev/null +++ b/charts/feature-application-observability/templates/_validation.tpl @@ -0,0 +1,13 @@ +{{- define "feature.applicationObservability.validate" }} +{{- $aReceiverIsEnabled := or .Values.receivers.grpc.enabled .Values.receivers.http.enabled .Values.receivers.zipkin.enabled }} +{{- if not $aReceiverIsEnabled }} + {{- $msg := list "" "At least one receiver must be enabled to use 
Application Observability." }} + {{- $msg = append $msg "Please enable one. For example:" }} + {{- $msg = append $msg "applicationObservability:" }} + {{- $msg = append $msg " receivers:" }} + {{- $msg = append $msg " grpc:" }} + {{- $msg = append $msg " enabled: true" }} + {{- $msg = append $msg "See https://github.com/grafana/k8s-monitoring-helm/blob/main/charts/feature-application-observability for more details." }} + {{- fail (join "\n" $msg) }} +{{- end }} +{{- end }} diff --git a/charts/feature-application-observability/templates/configmap.yaml b/charts/feature-application-observability/templates/configmap.yaml new file mode 100644 index 000000000..22a013858 --- /dev/null +++ b/charts/feature-application-observability/templates/configmap.yaml @@ -0,0 +1,12 @@ +{{- if .Values.deployAsConfigMap }} +{{- include "feature.applicationObservability.validate" . }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "feature.applicationObservability.fullname" . }} + namespace: {{ .Release.Namespace }} +data: + module.alloy: |- + {{- include "feature.applicationObservability.module" . 
| indent 4 }} +{{- end }} diff --git a/charts/feature-application-observability/tests/__snapshot__/.gitkeep b/charts/feature-application-observability/tests/__snapshot__/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/charts/feature-application-observability/tests/default_test.yaml b/charts/feature-application-observability/tests/default_test.yaml new file mode 100644 index 000000000..6e6ffbdc2 --- /dev/null +++ b/charts/feature-application-observability/tests/default_test.yaml @@ -0,0 +1,131 @@ +# yamllint disable rule:document-start rule:line-length rule:trailing-spaces +suite: Test default values +templates: + - configmap.yaml +tests: + - it: creates the default pipeline + set: + deployAsConfigMap: true + receivers: + grpc: + enabled: true + http: + enabled: true + zipkin: + enabled: true + asserts: + - isKind: + of: ConfigMap + - equal: + path: data["module.alloy"] + value: |- + declare "application_observability" { + argument "metrics_destinations" { + comment = "Must be a list of metrics destinations where collected metrics should be forwarded to" + } + + argument "logs_destinations" { + comment = "Must be a list of log destinations where collected logs should be forwarded to" + } + + argument "traces_destinations" { + comment = "Must be a list of trace destinations where collected trace should be forwarded to" + } + + // Receivers --> Resource Detection Processor + otelcol.receiver.otlp "receiver" { + grpc { + endpoint = "0.0.0.0:4317" + } + http { + endpoint = "0.0.0.0:4318" + } + debug_metrics { + disable_high_cardinality_metrics = true + } + output { + metrics = [otelcol.processor.resourcedetection.default.input] + logs = [otelcol.processor.resourcedetection.default.input] + traces = [otelcol.processor.resourcedetection.default.input] + } + } + otelcol.receiver.zipkin "receiver" { + endpoint = "0.0.0.0:9411" + debug_metrics { + disable_high_cardinality_metrics = true + } + output { + traces = 
[otelcol.processor.resourcedetection.default.input] + } + } + + // Resource Detection Processor --> K8s Attribute Processor + otelcol.processor.resourcedetection "default" { + detectors = ["env", "system"] + system { + hostname_sources = ["os"] + } + + output { + metrics = [otelcol.processor.k8sattributes.default.input] + logs = [otelcol.processor.k8sattributes.default.input] + traces = [otelcol.processor.k8sattributes.default.input] + } + } + + // K8s Attribute Processor --> Transform Processor + // Resource Detection Processor Traces --> Host Info Connector + otelcol.processor.k8sattributes "default" { + extract { + metadata = ["k8s.namespace.name","k8s.pod.name","k8s.deployment.name","k8s.statefulset.name","k8s.daemonset.name","k8s.cronjob.name","k8s.job.name","k8s.node.name","k8s.pod.uid","k8s.pod.start_time"] + } + pod_association { + source { + from = "connection" + } + } + + output { + metrics = [otelcol.processor.transform.default.input] + logs = [otelcol.processor.transform.default.input] + traces = [otelcol.processor.transform.default.input, otelcol.connector.host_info.default.input] + } + } + // Host Info Connector --> Batch Processor + otelcol.connector.host_info "default" { + host_identifiers = [ "k8s.node.name" ] + + output { + metrics = [otelcol.processor.batch.default.input] + } + } + + + // Transform Processor --> Batch Processor + otelcol.processor.transform "default" { + error_mode = "ignore" + log_statements { + context = "resource" + statements = [ + "set(attributes[\"pod\"], attributes[\"k8s.pod.name\"])", + "set(attributes[\"namespace\"], attributes[\"k8s.namespace.name\"])", + "set(attributes[\"loki.resource.labels\"], \"cluster, namespace, job, pod\")", + ] + } + + output { + metrics = [otelcol.processor.batch.default.input] + logs = [otelcol.processor.batch.default.input] + traces = [otelcol.processor.batch.default.input] + } + } + + // Batch Processor --> Destinations + otelcol.processor.batch "default" { + output { + metrics = 
argument.metrics_destinations.value + logs = argument.logs_destinations.value + traces = argument.traces_destinations.value + } + } + } diff --git a/charts/feature-application-observability/tests/validation_test.yaml b/charts/feature-application-observability/tests/validation_test.yaml new file mode 100644 index 000000000..f65752219 --- /dev/null +++ b/charts/feature-application-observability/tests/validation_test.yaml @@ -0,0 +1,19 @@ +# yamllint disable rule:document-start rule:line-length rule:trailing-spaces +suite: Test validation +templates: + - configmap.yaml +tests: + - it: requires at least one receiver + set: + deployAsConfigMap: true + asserts: + - failedTemplate: + errorMessage: |- + execution error at (k8s-monitoring-feature-application-observability/templates/configmap.yaml:2:4): + At least one receiver must be enabled to use Application Observability. + Please enable one. For example: + applicationObservability: + receivers: + grpc: + enabled: true + See https://github.com/grafana/k8s-monitoring-helm/blob/main/charts/feature-application-observability for more details. 
diff --git a/charts/feature-application-observability/values.schema.json b/charts/feature-application-observability/values.schema.json new file mode 100644 index 000000000..fdfb8fc06 --- /dev/null +++ b/charts/feature-application-observability/values.schema.json @@ -0,0 +1,219 @@ +{ + "$schema": "http://json-schema.org/schema#", + "type": "object", + "properties": { + "deployAsConfigMap": { + "type": "boolean" + }, + "fullnameOverride": { + "type": "string" + }, + "logs": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "filters": { + "type": "object", + "properties": { + "log_record": { + "type": "array" + } + } + }, + "transforms": { + "type": "object", + "properties": { + "labels": { + "type": "array", + "items": { + "type": "string" + } + }, + "log": { + "type": "array" + }, + "resource": { + "type": "array" + } + } + } + } + }, + "metrics": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "filters": { + "type": "object", + "properties": { + "datapoint": { + "type": "array" + }, + "metric": { + "type": "array" + } + } + }, + "transforms": { + "type": "object", + "properties": { + "datapoint": { + "type": "array" + }, + "metric": { + "type": "array" + }, + "resource": { + "type": "array" + } + } + } + } + }, + "nameOverride": { + "type": "string" + }, + "processors": { + "type": "object", + "properties": { + "batch": { + "type": "object", + "properties": { + "maxSize": { + "type": "integer" + }, + "size": { + "type": "integer" + }, + "timeout": { + "type": "string" + } + } + }, + "grafanaCloudMetrics": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + } + }, + "k8sattributes": { + "type": "object", + "properties": { + "annotations": { + "type": "array" + }, + "labels": { + "type": "array" + }, + "metadata": { + "type": "array", + "items": { + "type": "string" + } + } + } + }, + "memoryLimiter": { + "type": "object", + "properties": { + "checkInterval": { + "type": 
"string" + }, + "enabled": { + "type": "boolean" + }, + "limit": { + "type": "string" + } + } + } + } + }, + "receivers": { + "type": "object", + "properties": { + "grpc": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "include_debug_metrics": { + "type": "boolean" + }, + "port": { + "type": "integer" + } + } + }, + "http": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "include_debug_metrics": { + "type": "boolean" + }, + "port": { + "type": "integer" + } + } + }, + "zipkin": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "include_debug_metrics": { + "type": "boolean" + }, + "port": { + "type": "integer" + } + } + } + } + }, + "traces": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "filters": { + "type": "object", + "properties": { + "span": { + "type": "array" + }, + "spanevent": { + "type": "array" + } + } + }, + "transforms": { + "type": "object", + "properties": { + "resource": { + "type": "array" + }, + "span": { + "type": "array" + }, + "spanevent": { + "type": "array" + } + } + } + } + } + } +} diff --git a/charts/feature-application-observability/values.yaml b/charts/feature-application-observability/values.yaml new file mode 100644 index 000000000..82c10ab98 --- /dev/null +++ b/charts/feature-application-observability/values.yaml @@ -0,0 +1,120 @@ +--- +# -- Name override +# @section -- General settings +nameOverride: "" + +# -- Full name override +# @section -- General settings +fullnameOverride: "" + +receivers: + grpc: + enabled: false + port: 4317 + include_debug_metrics: false + http: + enabled: false + port: 4318 + include_debug_metrics: false + zipkin: + enabled: false + port: 9411 + include_debug_metrics: false + +processors: + grafanaCloudMetrics: + # -- Generate host info metrics from telemetry data, used in Application Observability in Grafana Cloud. 
+ # @section -- Processors: Grafana Cloud Host Info + enabled: true + + batch: + # -- What batch size to use, in bytes + # @section -- Processors: Batch + size: 16384 + # -- The upper limit of the amount of data contained in a single batch, in bytes. When set to 0, batches can be any size. + # @section -- Processors: Batch + maxSize: 0 + # -- How long before sending (Processors) + # @section -- Processors: Batch + timeout: 2s + + k8sattributes: + # -- Kubernetes metadata to extract and add to the attributes of the received telemetry data. + # @section -- Processors: K8s Attributes + metadata: + - k8s.namespace.name + - k8s.pod.name + - k8s.deployment.name + - k8s.statefulset.name + - k8s.daemonset.name + - k8s.cronjob.name + - k8s.job.name + - k8s.node.name + - k8s.pod.uid + - k8s.pod.start_time + + # -- Kubernetes labels to extract and add to the attributes of the received telemetry data. + # @section -- Processors: K8s Attributes + labels: [] + + # -- Kubernetes annotations to extract and add to the attributes of the received telemetry data. + # @section -- Processors: K8s Attributes + annotations: [] + + memoryLimiter: + # -- Use a memory limiter. + # @section -- Processors: Memory Limiter + enabled: false + # -- How often to check memory usage. + # @section -- Processors: Memory Limiter + checkInterval: 1s + # -- Maximum amount of memory targeted to be allocated by the process heap. + # @section -- Processors: Memory Limiter + limit: 0MiB + +metrics: + enabled: true + # -- Apply a filter to metrics received via the OTLP or OTLP HTTP receivers. + # ([docs](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.filter/)) + filters: + metric: [] + datapoint: [] + # -- Apply a transformation to metrics received via the OTLP or OTLP HTTP receivers. 
+ # ([docs](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.transform/)) + transforms: + resource: [] + metric: [] + datapoint: [] + +logs: + enabled: true + # -- Apply a filter to logs received via receivers. + # ([docs](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.filter/)) + filters: + log_record: [] + # -- Apply a transformation to logs received via the OTLP or OTLP HTTP receivers. + # ([docs](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.transform/)) + transforms: + # -- Resource transformation rules. + resource: [] + # -- Log transformation rules. + log: [] + # -- The list of labels to set in the log stream. + labels: ["cluster", "namespace", "job", "pod"] + +traces: + enabled: true + # -- Apply a filter to traces received via the OTLP or OTLP HTTP receivers. + # ([docs](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.filter/)) + filters: + span: [] + spanevent: [] + # -- Apply a transformation to traces received via the OTLP or OTLP HTTP receivers. 
+ # ([docs](https://grafana.com/docs/alloy/latest/reference/components/otelcol.processor.transform/)) + transforms: + resource: [] + span: [] + spanevent: [] + +# @ignore +deployAsConfigMap: false diff --git a/charts/feature-cluster-events/.helmignore b/charts/feature-cluster-events/.helmignore new file mode 100644 index 000000000..2b29eaf56 --- /dev/null +++ b/charts/feature-cluster-events/.helmignore @@ -0,0 +1,6 @@ +docs +schema-mods +tests +Makefile +README.md +README.md.gotmpl diff --git a/charts/feature-cluster-events/Chart.lock b/charts/feature-cluster-events/Chart.lock new file mode 100644 index 000000000..1e36a5261 --- /dev/null +++ b/charts/feature-cluster-events/Chart.lock @@ -0,0 +1,3 @@ +dependencies: [] +digest: sha256:643d5437104296e21d906ecb15b2c96ad278f20cfc4af53b12bb6069bd853726 +generated: "2024-08-21T14:40:45.012164-05:00" diff --git a/charts/feature-cluster-events/Chart.yaml b/charts/feature-cluster-events/Chart.yaml new file mode 100644 index 000000000..314c88c6b --- /dev/null +++ b/charts/feature-cluster-events/Chart.yaml @@ -0,0 +1,11 @@ +--- +apiVersion: v2 +name: k8s-monitoring-feature-cluster-events +description: Gathers Kubernetes Events +icon: https://raw.githubusercontent.com/grafana/grafana/main/public/img/grafana_icon.svg +version: 1.0.0 +appVersion: 1.0.0 +maintainers: + - email: pete.wall@grafana.com + name: petewall +dependencies: [] diff --git a/charts/feature-cluster-events/Makefile b/charts/feature-cluster-events/Makefile new file mode 100644 index 000000000..605b55098 --- /dev/null +++ b/charts/feature-cluster-events/Makefile @@ -0,0 +1,34 @@ +HAS_HELM_DOCS := $(shell command -v helm-docs;) +HAS_HELM_UNITTEST := $(shell helm plugin list | grep unittest 2> /dev/null) + +.SECONDEXPANSION: +README.md: values.yaml Chart.yaml $$(wildcard README.md.gotmpl) +ifdef HAS_HELM_DOCS + helm-docs +else + docker run --rm --volume "$(shell pwd):/helm-docs" -u $(shell id -u) jnorwood/helm-docs:latest +endif + +Chart.lock: Chart.yaml + helm 
dependency update . + @touch Chart.lock # Ensure the timestamp is updated + +values.schema.json: values.yaml $$(wildcard schema-mods/*) + ../../scripts/schema-gen.sh . + +.PHONY: clean +clean: + rm -f README.md values.schema.json + +.PHONY: build +build: README.md Chart.lock values.schema.json + +.PHONY: test +test: build + helm lint . + ct lint --lint-conf ../../.configs/lintconf.yaml --check-version-increment=false --charts . +ifdef HAS_HELM_UNITTEST + helm unittest . +else + docker run --rm --volume $(shell pwd):/apps helmunittest/helm-unittest . +endif diff --git a/charts/feature-cluster-events/README.md b/charts/feature-cluster-events/README.md new file mode 100644 index 000000000..11364e731 --- /dev/null +++ b/charts/feature-cluster-events/README.md @@ -0,0 +1,36 @@ +# k8s-monitoring-feature-cluster-events + +![Version: 1.0.0](https://img.shields.io/badge/Version-1.0.0-informational?style=flat-square) ![AppVersion: 1.0.0](https://img.shields.io/badge/AppVersion-1.0.0-informational?style=flat-square) + +Gathers Kubernetes Events + +## Maintainers + +| Name | Email | Url | +| ---- | ------ | --- | +| petewall | | | + +## Values + +### Processing settings + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| extraProcessingStages | string | `""` | Stage blocks to be added to the loki.process component for cluster events. ([docs](https://grafana.com/docs/alloy/latest/reference/components/loki.process/#blocks)) This value is templated so that you can refer to other values from this file. | + +### General settings + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| fullnameOverride | string | `""` | Full name override | +| nameOverride | string | `""` | Name override | + +### Gather settings + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| logFormat | string | `"logfmt"` | Log format used to forward cluster events. Allowed values: `logfmt` (default), `json`. 
| +| namespaces | list | `[]` | List of namespaces to watch for events (`[]` means all namespaces) | + +---------------------------------------------- +Autogenerated from chart metadata using [helm-docs v1.14.2](https://github.com/norwoodj/helm-docs/releases/v1.14.2) diff --git a/charts/feature-cluster-events/templates/_helpers.tpl b/charts/feature-cluster-events/templates/_helpers.tpl new file mode 100644 index 000000000..a28241213 --- /dev/null +++ b/charts/feature-cluster-events/templates/_helpers.tpl @@ -0,0 +1,17 @@ +{{/* +Create a default fully qualified name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "feature.clusterEvents.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" | lower }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride | lower }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" | lower }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" | lower }} +{{- end }} +{{- end }} +{{- end }} diff --git a/charts/feature-cluster-events/templates/_module.alloy.tpl b/charts/feature-cluster-events/templates/_module.alloy.tpl new file mode 100644 index 000000000..f53bb8ac7 --- /dev/null +++ b/charts/feature-cluster-events/templates/_module.alloy.tpl @@ -0,0 +1,23 @@ +{{- define "feature.clusterEvents.module" }} +declare "cluster_events" { + argument "logs_destinations" { + comment = "Must be a list of log destinations where collected logs should be forwarded to" + } + + loki.source.kubernetes_events "cluster_events" { + job_name = "integrations/kubernetes/eventhandler" + log_format = "{{ .Values.logFormat }}" + {{- if .Values.namespaces }} + namespaces = {{ .Values.namespaces | toJson }} + {{- end }} +{{- if .Values.extraProcessingStages }} + forward_to = 
loki.process.cluster_events.receiver + } + + loki.process "cluster_events" { +{{ .Values.extraProcessingStages | indent 4 }} +{{- end }} + forward_to = argument.logs_destinations.value + } +} +{{- end -}} \ No newline at end of file diff --git a/charts/feature-cluster-events/templates/_notes.tpl b/charts/feature-cluster-events/templates/_notes.tpl new file mode 100644 index 000000000..6447c8e4d --- /dev/null +++ b/charts/feature-cluster-events/templates/_notes.tpl @@ -0,0 +1,7 @@ +{{- define "feature.clusterEvents.notes.deployments" }}{{- end }} + +{{- define "feature.clusterEvents.notes.task" }} +Gather Kubernetes Cluster events{{- if .Values.namespaces }} from the namespaces {{ .Values.namespaces | join "," }}{{- end }} +{{- end }} + +{{- define "feature.clusterEvents.notes.actions" }}{{- end }} diff --git a/charts/feature-cluster-events/templates/configmap.yaml b/charts/feature-cluster-events/templates/configmap.yaml new file mode 100644 index 000000000..24a5b4f64 --- /dev/null +++ b/charts/feature-cluster-events/templates/configmap.yaml @@ -0,0 +1,11 @@ +{{- if .Values.deployAsConfigMap }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "feature.clusterEvents.fullname" . }} + namespace: {{ .Release.Namespace }} +data: + module.alloy: |- + {{- include "feature.clusterEvents.module" . 
| indent 4 }} +{{- end }} diff --git a/charts/feature-cluster-events/tests/__snapshot__/.gitkeep b/charts/feature-cluster-events/tests/__snapshot__/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/charts/feature-cluster-events/tests/default_test.yaml b/charts/feature-cluster-events/tests/default_test.yaml new file mode 100644 index 000000000..b1aa3d6c9 --- /dev/null +++ b/charts/feature-cluster-events/tests/default_test.yaml @@ -0,0 +1,25 @@ +# yamllint disable rule:document-start rule:line-length rule:trailing-spaces +suite: Test default values +templates: + - configmap.yaml +tests: + - it: should create a ConfigMap + set: + deployAsConfigMap: true + asserts: + - isKind: + of: ConfigMap + - equal: + path: data["module.alloy"] + value: |- + declare "cluster_events" { + argument "logs_destinations" { + comment = "Must be a list of log destinations where collected logs should be forwarded to" + } + + loki.source.kubernetes_events "cluster_events" { + job_name = "integrations/kubernetes/eventhandler" + log_format = "logfmt" + forward_to = argument.logs_destinations.value + } + } diff --git a/charts/feature-cluster-events/tests/extra_processing_stages_test.yaml b/charts/feature-cluster-events/tests/extra_processing_stages_test.yaml new file mode 100644 index 000000000..fc1ceae79 --- /dev/null +++ b/charts/feature-cluster-events/tests/extra_processing_stages_test.yaml @@ -0,0 +1,38 @@ +# yamllint disable rule:document-start rule:line-length rule:trailing-spaces +suite: Test extra processing stages +templates: + - configmap.yaml +tests: + - it: should create a ConfigMap + set: + deployAsConfigMap: true + extraProcessingStages: |- + stage.drop { + source = "namespace" + value = "private" + } + asserts: + - isKind: + of: ConfigMap + - equal: + path: data["module.alloy"] + value: |- + declare "cluster_events" { + argument "logs_destinations" { + comment = "Must be a list of log destinations where collected logs should be forwarded to" + } + + 
loki.source.kubernetes_events "cluster_events" { + job_name = "integrations/kubernetes/eventhandler" + log_format = "logfmt" + forward_to = loki.process.cluster_events.receiver + } + + loki.process "cluster_events" { + stage.drop { + source = "namespace" + value = "private" + } + forward_to = argument.logs_destinations.value + } + } diff --git a/charts/feature-cluster-events/tests/namespace_test.yaml b/charts/feature-cluster-events/tests/namespace_test.yaml new file mode 100644 index 000000000..7b426ddb8 --- /dev/null +++ b/charts/feature-cluster-events/tests/namespace_test.yaml @@ -0,0 +1,27 @@ +# yamllint disable rule:document-start rule:line-length rule:trailing-spaces +suite: Test namespaces +templates: + - configmap.yaml +tests: + - it: should create a ConfigMap + set: + deployAsConfigMap: true + namespaces: ["a", "b"] + asserts: + - isKind: + of: ConfigMap + - equal: + path: data["module.alloy"] + value: |- + declare "cluster_events" { + argument "logs_destinations" { + comment = "Must be a list of log destinations where collected logs should be forwarded to" + } + + loki.source.kubernetes_events "cluster_events" { + job_name = "integrations/kubernetes/eventhandler" + log_format = "logfmt" + namespaces = ["a","b"] + forward_to = argument.logs_destinations.value + } + } diff --git a/charts/feature-cluster-events/values.schema.json b/charts/feature-cluster-events/values.schema.json new file mode 100644 index 000000000..37c56b68e --- /dev/null +++ b/charts/feature-cluster-events/values.schema.json @@ -0,0 +1,24 @@ +{ + "$schema": "http://json-schema.org/schema#", + "type": "object", + "properties": { + "deployAsConfigMap": { + "type": "boolean" + }, + "extraProcessingStages": { + "type": "string" + }, + "fullnameOverride": { + "type": "string" + }, + "logFormat": { + "type": "string" + }, + "nameOverride": { + "type": "string" + }, + "namespaces": { + "type": "array" + } + } +} diff --git a/charts/feature-cluster-events/values.yaml 
b/charts/feature-cluster-events/values.yaml new file mode 100644 index 000000000..3d083cc15 --- /dev/null +++ b/charts/feature-cluster-events/values.yaml @@ -0,0 +1,25 @@ +--- +# -- Name override +# @section -- General settings +nameOverride: "" + +# -- Full name override +# @section -- General settings +fullnameOverride: "" + +# -- List of namespaces to watch for events (`[]` means all namespaces) +# @section -- Gather settings +namespaces: [] + +# -- Log format used to forward cluster events. Allowed values: `logfmt` (default), `json`. +# @section -- Gather settings +logFormat: logfmt + +# -- Stage blocks to be added to the loki.process component for cluster events. +# ([docs](https://grafana.com/docs/alloy/latest/reference/components/loki.process/#blocks)) +# This value is templated so that you can refer to other values from this file. +# @section -- Processing settings +extraProcessingStages: "" + +# @ignore +deployAsConfigMap: false diff --git a/charts/feature-cluster-metrics/.ct.yaml b/charts/feature-cluster-metrics/.ct.yaml new file mode 100644 index 000000000..7a5388894 --- /dev/null +++ b/charts/feature-cluster-metrics/.ct.yaml @@ -0,0 +1,4 @@ +--- +chart-repos: + - kepler=https://sustainable-computing-io.github.io/kepler-helm-chart + - prometheus-community=https://prometheus-community.github.io/helm-charts diff --git a/charts/feature-cluster-metrics/.helmignore b/charts/feature-cluster-metrics/.helmignore new file mode 100644 index 000000000..2b29eaf56 --- /dev/null +++ b/charts/feature-cluster-metrics/.helmignore @@ -0,0 +1,6 @@ +docs +schema-mods +tests +Makefile +README.md +README.md.gotmpl diff --git a/charts/feature-cluster-metrics/Chart.lock b/charts/feature-cluster-metrics/Chart.lock new file mode 100644 index 000000000..9fab8bf6c --- /dev/null +++ b/charts/feature-cluster-metrics/Chart.lock @@ -0,0 +1,15 @@ +dependencies: +- name: kube-state-metrics + repository: https://prometheus-community.github.io/helm-charts + version: 5.25.1 +- name: 
prometheus-node-exporter + repository: https://prometheus-community.github.io/helm-charts + version: 4.39.0 +- name: prometheus-windows-exporter + repository: https://prometheus-community.github.io/helm-charts + version: 0.5.1 +- name: kepler + repository: https://sustainable-computing-io.github.io/kepler-helm-chart + version: 0.5.9 +digest: sha256:a4fcfa6c94a3443665921ad79e766dccb8439d9c3d2f4477bc76cd7e216e3146 +generated: "2024-10-01T10:33:51.190122-05:00" diff --git a/charts/feature-cluster-metrics/Chart.yaml b/charts/feature-cluster-metrics/Chart.yaml new file mode 100644 index 000000000..2ef061e15 --- /dev/null +++ b/charts/feature-cluster-metrics/Chart.yaml @@ -0,0 +1,32 @@ +--- +apiVersion: v2 +name: k8s-monitoring-feature-cluster-metrics +description: Gathers Kubernetes Cluster metrics +icon: https://raw.githubusercontent.com/grafana/grafana/main/public/img/grafana_icon.svg +version: 1.0.0 +appVersion: 1.0.0 +maintainers: + - email: pete.wall@grafana.com + name: petewall +dependencies: + - name: kube-state-metrics + version: 5.25.1 + repository: https://prometheus-community.github.io/helm-charts + condition: kube-state-metrics.deploy + + - alias: node-exporter + name: prometheus-node-exporter + version: 4.39.0 + repository: https://prometheus-community.github.io/helm-charts + condition: node-exporter.deploy + + - alias: windows-exporter + name: prometheus-windows-exporter + version: 0.5.1 + repository: https://prometheus-community.github.io/helm-charts + condition: windows-exporter.deploy + + - name: kepler + version: 0.5.9 + repository: https://sustainable-computing-io.github.io/kepler-helm-chart + condition: kepler.enabled diff --git a/charts/feature-cluster-metrics/Makefile b/charts/feature-cluster-metrics/Makefile new file mode 100644 index 000000000..e465f2aa3 --- /dev/null +++ b/charts/feature-cluster-metrics/Makefile @@ -0,0 +1,34 @@ +HAS_HELM_DOCS := $(shell command -v helm-docs;) +HAS_HELM_UNITTEST := $(shell helm plugin list | grep unittest 2> 
/dev/null) + +.SECONDEXPANSION: +README.md: values.yaml Chart.yaml $$(wildcard README.md.gotmpl) +ifdef HAS_HELM_DOCS + helm-docs +else + docker run --rm --volume "$(shell pwd):/helm-docs" -u $(shell id -u) jnorwood/helm-docs:latest +endif + +Chart.lock: Chart.yaml + helm dependency update . + @touch Chart.lock # Ensure the timestamp is updated + +values.schema.json: values.yaml $$(wildcard schema-mods/*) + ../../scripts/schema-gen.sh . + +.PHONY: clean +clean: + rm -f README.md values.schema.json + +.PHONY: build +build: README.md Chart.lock values.schema.json + +.PHONY: test +test: build + helm lint . + ct lint --config .ct.yaml --lint-conf ../../.configs/lintconf.yaml --check-version-increment=false --charts . +ifdef HAS_HELM_UNITTEST + helm unittest . +else + docker run --rm --volume $(shell pwd):/apps helmunittest/helm-unittest . +endif diff --git a/charts/feature-cluster-metrics/README.md b/charts/feature-cluster-metrics/README.md new file mode 100644 index 000000000..b2b12632e --- /dev/null +++ b/charts/feature-cluster-metrics/README.md @@ -0,0 +1,324 @@ + + +# k8s-monitoring-feature-cluster-metrics + +![Version: 1.0.0](https://img.shields.io/badge/Version-1.0.0-informational?style=flat-square) ![AppVersion: 1.0.0](https://img.shields.io/badge/AppVersion-1.0.0-informational?style=flat-square) + +Gathers Kubernetes Cluster metrics + +This chart deploys the Cluster Metrics feature of the Kubernetes Observability Helm chart. It includes the ability to +collect metrics from the Kubernetes Cluster itself, from sources like the Kubelet and cAdvisor, from common supporting +services like [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) and +[Node Exporter](https://github.com/prometheus/node_exporter), and from systems to capture additional data like Kepler. 
+ +## Metric systems + +The Cluster Metrics feature of the Kubernetes Observability Helm chart includes the following metric systems: + +* Kubelet +* cAdvisor +* API Server +* Kube Controller Manager +* Kube Proxy +* Kube Scheduler +* kube-state-metrics +* Node Exporter +* Windows Exporter +* Kepler + +### Kubelet + +Kubelet metrics gather information about Kubernetes information on each node. + +The kubelet metric source uses an [allow list](#metrics-tuning--allow-lists), +[default-allow-lists/kubelet.yaml](./default-allow-lists/kubelet.yaml). + +### cAdvisor + +[cAdvisor](https://github.com/google/cadvisor) metrics gather information about containers on each node. + +The cAdvisor metric source uses an [allow list](#metrics-tuning--allow-lists), +[default-allow-lists/cadvisor.yaml](./default-allow-lists/cadvisor.yaml). + +### API Server + +API Server metrics gather information about the Kubernetes API Server. + +### Kube Controller Manager + +Kube Controller Manager metrics gather information about the Kubernetes Controller Manager. + +### Kube Proxy + +Kube Proxy metrics gather information about the Kubernetes Proxy. + +### Kube Scheduler + +Kube Scheduler metrics gather information about the Kubernetes Scheduler. + +### kube-state-metrics + +[kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) metrics gather information about Kubernetes +resources inside the cluster. + +The kube-state-metrics metric source uses an [allow list](#metrics-tuning--allow-lists), +[default-allow-lists/kube-state-metrics.yaml](./default-allow-lists/kube-state-metrics.yaml). + +### Node Exporter + +[Node Exporter](https://github.com/prometheus/node_exporter) metrics gather information about Linux Kubernetes Nodes. 
+ +The Node Exporter metric source uses an [allow list](#metrics-tuning--allow-lists), +[default-allow-lists/node-exporter.yaml](./default-allow-lists/node-exporter.yaml), and has an integration allow list, +[default-allow-lists/node-exporter-integration.yaml](./default-allow-lists/node-exporter-integration.yaml). + +### Windows Exporter + +[Windows Exporter](https://github.com/prometheus-community/windows_exporter) metrics gather information about Windows +Kubernetes Nodes. + +The Windows Exporter metric source uses an [allow list](#metrics-tuning--allow-lists), +[default-allow-lists/windows-exporter.yaml](./default-allow-lists/windows-exporter.yaml). + +### Kepler + +[Kepler](https://sustainable-computing.io/) metrics gather information about the Kubernetes cluster. + +The Kepler metric source uses an [allow list](#metrics-tuning--allow-lists), +[default-allow-lists/kepler.yaml](./default-allow-lists/kepler.yaml). + +## Metrics Tuning & Allow Lists + +All metric sources have the ability to adjust the amount of metrics being scraped and their labels. This can be useful +to limit the number of metrics delivered to your destinations. Many of the metric sources also have an allow list, which +is a set of metric names that will be kept, while any metrics not on the list will be dropped. The allow lists are tuned +to return a useful, but minimal set of metrics for typical use cases. Some sources have an "integration allow list", +which contains even more metrics for diving into the details of the source itself. + +To control these settings, use the `metricsTuning` section in the values file. 
+ +```yaml +: + metricsTuning: + useDefaultAllowList: # Use the allow list for this metric source + useIntegrationAllowList: # Use the integration allow list for this metric source + includeMetrics: [] # Metrics to be kept + excludeMetrics: [] # Metrics to be dropped +``` + +The behavior of the combination of these settings is shown in this table: + +| Allow List | includeMetrics | excludeMetrics | Result | +|------------|------------------|--------------------------|-----------------------------------------------------------------------------------------------------------------------------------------| +| true | `[]` | `[]` | Use the allow list metric list | +| false | `[]` | `[]` | No filter, keep all metrics | +| true | `[my_metric]` | `[]` | Use the allow list metric list with an additional metric | +| false | `[my_metric_.*]` | `[]` | *Only* keep metrics that start with `my_metric_` | +| true | `[]` | `[my_metric_.*]` | Use the allow list metric filter, but exclude anything that starts with `my_metric_` | +| false | `[]` | `[my_metric_.*]` | Keep all metrics except anything that starts with `my_metric_` | +| true | `[my_metric_.*]` | `[other_metric_.*]` | Use the allow list metric filter, and keep anything that starts with `my_metric_`, but remove anything that starts with `other_metric_` | +| false | `[my_metric_.*]` | `[my_metric_not_needed]` | *Only* keep metrics that start with `my_metric_`, but remove any that are named `my_metric_not_needed` | + +In addition to all fo this, you can also use the `extraMetricProcessingRules` section to add arbitrary relabeling rules that can be used to take any +action on the metric list, including filtering based on label or other actions. 
+ +## Maintainers + +| Name | Email | Url | +| ---- | ------ | --- | +| petewall | | | + + +## Requirements + +| Repository | Name | Version | +|------------|------|---------| +| https://prometheus-community.github.io/helm-charts | kube-state-metrics | 5.25.1 | +| https://prometheus-community.github.io/helm-charts | node-exporter(prometheus-node-exporter) | 4.39.0 | +| https://prometheus-community.github.io/helm-charts | windows-exporter(prometheus-windows-exporter) | 0.5.1 | +| https://sustainable-computing-io.github.io/kepler-helm-chart | kepler | 0.5.9 | + + + +## Values + +### API Server + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| apiServer.enabled | string | false | Scrape metrics from the API Server | +| apiServer.extraDiscoveryRules | string | `""` | Rule blocks to be added to the discovery.relabel component for the API Server. These relabeling rules are applied pre-scrape against the targets from service discovery. Before the scrape, any remaining target labels that start with `__` (i.e. `__meta_kubernetes*`) are dropped. ([docs](https://grafana.com/docs/alloy/latest/reference/components/discovery.relabel/#rule-block)) | +| apiServer.extraMetricProcessingRules | string | `""` | Rule blocks to be added to the prometheus.relabel component for the API Server. These relabeling rules are applied post-scrape against the metrics returned from the scraped target, no `__meta*` labels are present. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#rule-block)) | +| apiServer.maxCacheSize | string | `nil` | Sets the max_cache_size for cadvisor prometheus.relabel component. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#arguments)) Overrides metrics.maxCacheSize | +| apiServer.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. 
| +| apiServer.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. An empty list means keep all. | +| apiServer.scrapeInterval | string | 60s | How frequently to scrape metrics from the API Server Overrides metrics.scrapeInterval | + +### cAdvisor + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| cadvisor.enabled | bool | `true` | Scrape metrics from cAdvisor. | +| cadvisor.extraDiscoveryRules | string | `""` | Rule blocks to be added to the discovery.relabel component for cAdvisor entities. These relabeling rules are applied pre-scrape against the targets from service discovery. Before the scrape, any remaining target labels that start with `__` (i.e. `__meta_kubernetes*`) are dropped. ([docs](https://grafana.com/docs/alloy/latest/reference/components/discovery.relabel/#rule-block)) | +| cadvisor.extraMetricProcessingRules | string | `""` | Rule blocks to be added to the prometheus.relabel component for cAdvisor metrics. These relabeling rules are applied post-scrape against the metrics returned from the scraped target, no `__meta*` labels are present. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#rule-block)) | +| cadvisor.maxCacheSize | string | `100000` | Sets the max_cache_size for cadvisor prometheus.relabel component. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#arguments)) Overrides global.maxCacheSize | +| cadvisor.metricsTuning.dropEmptyContainerLabels | bool | `true` | Drop metrics that have an empty container label | +| cadvisor.metricsTuning.dropEmptyImageLabels | bool | `true` | Drop metrics that have an empty image label | +| cadvisor.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. | +| cadvisor.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. 
Can use regular expressions. | +| cadvisor.metricsTuning.keepPhysicalFilesystemDevices | list | `["mmcblk.p.+","nvme.+","rbd.+","sd.+","vd.+","xvd.+","dasd.+"]` | Only keep filesystem metrics that use the following physical devices | +| cadvisor.metricsTuning.keepPhysicalNetworkDevices | list | `["en[ospx][0-9].*","wlan[0-9].*","eth[0-9].*"]` | Only keep network metrics that use the following physical devices | +| cadvisor.metricsTuning.normalizeUnnecessaryLabels | list | `[{"labels":["boot_id","system_uuid"],"metric":"machine_memory_bytes"}]` | Normalize labels to the same value for the given metric and label pairs | +| cadvisor.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from cAdvisor to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) | + +### cadvisor + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| cadvisor.scrapeInterval | string | `60s` | How frequently to scrape cAdvisor metrics. | + +### Control Plane + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| controlPlane.enabled | bool | `false` | enable all Kubernetes Control Plane metrics sources. This includes api-server, kube-scheduler, kube-controller-manager, and etcd. | + +### General settings + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| fullnameOverride | string | `""` | Full name override | +| nameOverride | string | `""` | Name override | + +### Global Settings + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| global.maxCacheSize | int | `100000` | Sets the max_cache_size for every prometheus.relabel component. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#arguments)) This should be at least 2x-5x your largest scrape target or samples appended rate. 
| +| global.platform | string | `""` | The specific platform for this cluster. Will enable compatibility for some platforms. Supported options: (empty) or "openshift". | +| global.scrapeInterval | string | `"60s"` | How frequently to scrape metrics. | + +### Kepler + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| kepler.enabled | bool | `false` | Deploy and scrape Kepler metrics. | +| kepler.extraDiscoveryRules | string | `""` | Rule blocks to be added to the discovery.relabel component for Kepler. These relabeling rules are applied pre-scrape against the targets from service discovery. Before the scrape, any remaining target labels that start with __ (i.e. __meta_kubernetes*) are dropped. ([docs](https://grafana.com/docs/alloy/latest/reference/components/discovery.relabel/#rule-block)) | +| kepler.extraMetricProcessingRules | string | `""` | Rule blocks to be added to the prometheus.relabel component for Kepler. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#rule-block)) These relabeling rules are applied post-scrape against the metrics returned from the scraped target, no __meta* labels are present. | +| kepler.labelMatchers | object | `{"app.kubernetes.io/name":"kepler"}` | Label matchers used to select the Kepler pods | +| kepler.maxCacheSize | string | `100000` | Sets the max_cache_size for the prometheus.relabel component for Kepler. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#arguments)) Overrides global.maxCacheSize | +| kepler.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. | +| kepler.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. 
| +| kepler.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Kepler to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) | +| kepler.scrapeInterval | string | `60s` | How frequently to scrape metrics from Kepler. Overrides global.scrapeInterval. | + +### kube-state-metrics + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| kube-state-metrics.deploy | bool | `true` | Deploy kube-state-metrics. Set to false if your cluster already has kube-state-metrics deployed. | +| kube-state-metrics.enabled | bool | `true` | Scrape metrics from kube-state-metrics. | +| kube-state-metrics.extraMetricProcessingRules | string | `""` | Rule blocks to be added to the prometheus.relabel component for kube-state-metrics metrics. These relabeling rules are applied post-scrape against the metrics returned from the scraped target, no `__meta*` labels are present. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#rule-block)) | +| kube-state-metrics.labelMatchers | object | `{"app.kubernetes.io/name":"kube-state-metrics"}` | Labels used to select the kube-state-metrics service. | +| kube-state-metrics.maxCacheSize | string | `100000` | Sets the max_cache_size for cadvisor prometheus.relabel component. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#arguments)) Overrides global.maxCacheSize | +| kube-state-metrics.metricLabelsAllowlist | list | `["nodes=[*]"]` | `kube__labels` metrics to generate. | +| kube-state-metrics.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. | +| kube-state-metrics.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. 
| +| kube-state-metrics.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Kube State Metrics to a useful, minimal set. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) | +| kube-state-metrics.scrapeInterval | string | `60s` | How frequently to scrape kube-state-metrics metrics. | + +### Kube Controller Manager + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| kubeControllerManager.enabled | string | false | Scrape metrics from the Kube Controller Manager | +| kubeControllerManager.extraDiscoveryRules | string | `""` | Rule blocks to be added to the discovery.relabel component for the Kube Controller Manager. These relabeling rules are applied pre-scrape against the targets from service discovery. Before the scrape, any remaining target labels that start with `__` (i.e. `__meta_kubernetes*`) are dropped. ([docs](https://grafana.com/docs/alloy/latest/reference/components/discovery.relabel/#rule-block)) | +| kubeControllerManager.extraMetricProcessingRules | string | `""` | Rule blocks to be added to the prometheus.relabel component for the Kube Controller Manager. These relabeling rules are applied post-scrape against the metrics returned from the scraped target, no `__meta*` labels are present. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#rule-block)) | +| kubeControllerManager.maxCacheSize | string | `nil` | Sets the max_cache_size for cadvisor prometheus.relabel component. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#arguments)) Overrides metrics.maxCacheSize | +| kubeControllerManager.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. | +| kubeControllerManager.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. 
An empty list means keep all. | +| kubeControllerManager.port | int | `10257` | Port number used by the Kube Controller Manager, set by `--secure-port.` | +| kubeControllerManager.scrapeInterval | string | 60s | How frequently to scrape metrics from the Kube Controller Manager Overrides metrics.scrapeInterval | + +### Kube Proxy + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| kubeProxy.enabled | string | false | Scrape metrics from the Kube Proxy | +| kubeProxy.extraDiscoveryRules | string | `""` | Rule blocks to be added to the discovery.relabel component for the Kube Proxy. These relabeling rules are applied pre-scrape against the targets from service discovery. Before the scrape, any remaining target labels that start with `__` (i.e. `__meta_kubernetes*`) are dropped. ([docs](https://grafana.com/docs/alloy/latest/reference/components/discovery.relabel/#rule-block)) | +| kubeProxy.extraMetricProcessingRules | string | `""` | Rule blocks to be added to the prometheus.relabel component for the Kube Proxy. These relabeling rules are applied post-scrape against the metrics returned from the scraped target, no `__meta*` labels are present. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#rule-block)) | +| kubeProxy.maxCacheSize | string | `nil` | Sets the max_cache_size for cadvisor prometheus.relabel component. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#arguments)) Overrides metrics.maxCacheSize | +| kubeProxy.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. | +| kubeProxy.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. An empty list means keep all. | +| kubeProxy.port | int | `10249` | Port number used by the Kube Proxy, set in `--metrics-bind-address`. 
| +| kubeProxy.scrapeInterval | string | 60s | How frequently to scrape metrics from the Kube Proxy Overrides metrics.scrapeInterval | + +### Kube Scheduler + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| kubeScheduler.enabled | string | false | Scrape metrics from the Kube Scheduler | +| kubeScheduler.extraDiscoveryRules | string | `""` | Rule blocks to be added to the discovery.relabel component for the Kube Scheduler. These relabeling rules are applied pre-scrape against the targets from service discovery. Before the scrape, any remaining target labels that start with `__` (i.e. `__meta_kubernetes*`) are dropped. ([docs](https://grafana.com/docs/alloy/latest/reference/components/discovery.relabel/#rule-block)) | +| kubeScheduler.extraMetricProcessingRules | string | `""` | Rule blocks to be added to the prometheus.relabel component for the Kube Scheduler. These relabeling rules are applied post-scrape against the metrics returned from the scraped target, no `__meta*` labels are present. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#rule-block)) | +| kubeScheduler.maxCacheSize | string | `nil` | Sets the max_cache_size for cadvisor prometheus.relabel component. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#arguments)) Overrides metrics.maxCacheSize | +| kubeScheduler.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. | +| kubeScheduler.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. An empty list means keep all. | +| kubeScheduler.port | int | `10259` | Port number used by the Kube Scheduler, set by `--secure-port`. 
| +| kubeScheduler.scrapeInterval | string | 60s | How frequently to scrape metrics from the Kube Scheduler Overrides metrics.scrapeInterval | + +### Kubelet + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| kubelet.enabled | bool | `true` | Scrape metrics from kubelet. | +| kubelet.extraDiscoveryRules | string | `""` | Rule blocks to be added to the discovery.relabel component for Kubelet entities. These relabeling rules are applied pre-scrape against the targets from service discovery. Before the scrape, any remaining target labels that start with `__` (i.e. `__meta_kubernetes*`) are dropped. ([docs](https://grafana.com/docs/alloy/latest/reference/components/discovery.relabel/#rule-block)) | +| kubelet.extraMetricProcessingRules | string | `""` | Rule blocks to be added to the prometheus.relabel component for Kubelet metrics. These relabeling rules are applied post-scrape against the metrics returned from the scraped target, no `__meta*` labels are present. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#rule-block)) | +| kubelet.maxCacheSize | string | `100000` | Sets the max_cache_size for cadvisor prometheus.relabel component. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#arguments)) Overrides global.maxCacheSize | +| kubelet.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. | +| kubelet.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. | +| kubelet.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from the Kubelet to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) | +| kubelet.scrapeInterval | string | `60s` | How frequently to scrape Kubelet metrics. 
| + +### Node Exporter - Deployment settings + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| node-exporter.deploy | bool | `true` | Deploy Node Exporter. Set to false if your cluster already has Node Exporter deployed. | + +### Node Exporter + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| node-exporter.enabled | bool | `true` | Scrape metrics from Node Exporter. | +| node-exporter.extraMetricProcessingRules | string | `""` | Rule blocks to be added to the prometheus.relabel component for Node Exporter metrics. These relabeling rules are applied post-scrape against the metrics returned from the scraped target, no `__meta*` labels are present. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#rule-block)) | +| node-exporter.labelMatchers | object | `{"app.kubernetes.io/name":"node-exporter"}` | Labels used to select the Node Exporter pods. | +| node-exporter.maxCacheSize | string | `100000` | Sets the max_cache_size for cadvisor prometheus.relabel component. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#arguments)) Overrides global.maxCacheSize | +| node-exporter.metricsTuning.dropMetricsForFilesystem | list | `["tempfs"]` | Drop metrics for the given filesystem types | +| node-exporter.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. | +| node-exporter.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. | +| node-exporter.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Node Exporter to the minimal set required for Kubernetes Monitoring. 
See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) | +| node-exporter.metricsTuning.useIntegrationAllowList | bool | `false` | Filter the list of metrics from Node Exporter to the minimal set required for Kubernetes Monitoring as well as the Node Exporter integration. | +| node-exporter.scrapeInterval | string | `60s` | How frequently to scrape Node Exporter metrics. | + +### Windows Exporter - Deployment settings + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| windows-exporter.deploy | bool | `true` | Deploy Windows Exporter. Set to false if your cluster already has Windows Exporter deployed. | + +### Windows Exporter + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| windows-exporter.enabled | bool | `true` | Scrape node metrics | +| windows-exporter.extraMetricProcessingRules | string | `""` | Rule blocks to be added to the prometheus.relabel component for Windows Exporter metrics. These relabeling rules are applied post-scrape against the metrics returned from the scraped target, no `__meta*` labels are present. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#rule-block)) | +| windows-exporter.labelMatchers | object | `{"app.kubernetes.io/name":"windows-exporter"}` | Labels used to select the Windows Exporter pods. | +| windows-exporter.maxCacheSize | string | `100000` | Sets the max_cache_size for cadvisor prometheus.relabel component. This should be at least 2x-5x your largest scrape target or samples appended rate. ([docs](https://grafana.com/docs/alloy/latest/reference/components/prometheus.relabel/#arguments)) Overrides global.maxCacheSize | +| windows-exporter.metricsTuning.excludeMetrics | list | `[]` | Metrics to drop. Can use regular expressions. | +| windows-exporter.metricsTuning.includeMetrics | list | `[]` | Metrics to keep. Can use regular expressions. 
| +| windows-exporter.metricsTuning.useDefaultAllowList | bool | `true` | Filter the list of metrics from Windows Exporter to the minimal set required for Kubernetes Monitoring. See [Metrics Tuning and Allow Lists](#metrics-tuning-and-allow-lists) | +| windows-exporter.scrapeInterval | string | `60s` | How frequently to scrape metrics from Windows Exporter. | + diff --git a/charts/feature-cluster-metrics/README.md.gotmpl b/charts/feature-cluster-metrics/README.md.gotmpl new file mode 100644 index 000000000..673d1e606 --- /dev/null +++ b/charts/feature-cluster-metrics/README.md.gotmpl @@ -0,0 +1,141 @@ + + +{{ template "chart.header" . }} +{{ template "chart.deprecationWarning" . }} + +{{ template "chart.badgesSection" . }} + +{{ template "chart.description" . }} + +{{ template "chart.homepageLine" . }} + +This chart deploys the Cluster Metrics feature of the Kubernetes Observability Helm chart. It includes the ability to +collect metrics from the Kubernetes Cluster itself, from sources like the Kubelet and cAdvisor, from common supporting +services like [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) and +[Node Exporter](https://github.com/prometheus/node_exporter), and from systems to capture additional data like Kepler. + +## Metric systems + +The Cluster Metrics feature of the Kubernetes Observability Helm chart includes the following metric systems: + +* Kubelet +* cAdvisor +* API Server +* Kube Controller Manager +* Kube Proxy +* Kube Scheduler +* kube-state-metrics +* Node Exporter +* Windows Exporter +* Kepler + +### Kubelet + +Kubelet metrics gather information about Kubernetes information on each node. + +The kubelet metric source uses an [allow list](#metrics-tuning--allow-lists), +[default-allow-lists/kubelet.yaml](./default-allow-lists/kubelet.yaml). + +### cAdvisor + +[cAdvisor](https://github.com/google/cadvisor) metrics gather information about containers on each node. 
+ +The cAdvisor metric source uses an [allow list](#metrics-tuning--allow-lists), +[default-allow-lists/cadvisor.yaml](./default-allow-lists/cadvisor.yaml). + +### API Server + +API Server metrics gather information about the Kubernetes API Server. + +### Kube Controller Manager + +Kube Controller Manager metrics gather information about the Kubernetes Controller Manager. + +### Kube Proxy + +Kube Proxy metrics gather information about the Kubernetes Proxy. + +### Kube Scheduler + +Kube Scheduler metrics gather information about the Kubernetes Scheduler. + +### kube-state-metrics + +[kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) metrics gather information about Kubernetes +resources inside the cluster. + +The kube-state-metrics metric source uses an [allow list](#metrics-tuning--allow-lists), +[default-allow-lists/kube-state-metrics.yaml](./default-allow-lists/kube-state-metrics.yaml). + +### Node Exporter + +[Node Exporter](https://github.com/prometheus/node_exporter) metrics gather information about Linux Kubernetes Nodes. + +The Node Exporter metric source uses an [allow list](#metrics-tuning--allow-lists), +[default-allow-lists/node-exporter.yaml](./default-allow-lists/node-exporter.yaml), and has an integration allow list, +[default-allow-lists/node-exporter-integration.yaml](./default-allow-lists/node-exporter-integration.yaml). + +### Windows Exporter + +[Windows Exporter](https://github.com/prometheus-community/windows_exporter) metrics gather information about Windows +Kubernetes Nodes. + +The Windows Exporter metric source uses an [allow list](#metrics-tuning--allow-lists), +[default-allow-lists/windows-exporter.yaml](./default-allow-lists/windows-exporter.yaml). + +### Kepler + +[Kepler](https://sustainable-computing.io/) metrics gather information about the Kubernetes cluster. + +The Kepler metric source uses an [allow list](#metrics-tuning--allow-lists), +[default-allow-lists/kepler.yaml](./default-allow-lists/kepler.yaml). 
+ +## Metrics Tuning & Allow Lists + +All metric sources have the ability to adjust the amount of metrics being scraped and their labels. This can be useful +to limit the number of metrics delivered to your destinations. Many of the metric sources also have an allow list, which +is a set of metric names that will be kept, while any metrics not on the list will be dropped. The allow lists are tuned +to return a useful, but minimal set of metrics for typical use cases. Some sources have an "integration allow list", +which contains even more metrics for diving into the details of the source itself. + +To control these settings, use the `metricsTuning` section in the values file. + +```yaml +: + metricsTuning: + useDefaultAllowList: # Use the allow list for this metric source + useIntegrationAllowList: # Use the integration allow list for this metric source + includeMetrics: [] # Metrics to be kept + excludeMetrics: [] # Metrics to be dropped +``` + +The behavior of the combination of these settings is shown in this table: + +| Allow List | includeMetrics | excludeMetrics | Result | +|------------|------------------|--------------------------|-----------------------------------------------------------------------------------------------------------------------------------------| +| true | `[]` | `[]` | Use the allow list metric list | +| false | `[]` | `[]` | No filter, keep all metrics | +| true | `[my_metric]` | `[]` | Use the allow list metric list with an additional metric | +| false | `[my_metric_.*]` | `[]` | *Only* keep metrics that start with `my_metric_` | +| true | `[]` | `[my_metric_.*]` | Use the allow list metric filter, but exclude anything that starts with `my_metric_` | +| false | `[]` | `[my_metric_.*]` | Keep all metrics except anything that starts with `my_metric_` | +| true | `[my_metric_.*]` | `[other_metric_.*]` | Use the allow list metric filter, and keep anything that starts with `my_metric_`, but remove anything that starts with `other_metric_` 
| +| false | `[my_metric_.*]` | `[my_metric_not_needed]` | *Only* keep metrics that start with `my_metric_`, but remove any that are named `my_metric_not_needed` | + +In addition to all of this, you can also use the `extraMetricProcessingRules` section to add arbitrary relabeling rules that can be used to take any +action on the metric list, including filtering based on label or other actions. + +{{ template "chart.maintainersSection" . }} + +{{ template "chart.sourcesSection" . }} + + +{{ template "chart.requirementsSection" . }} + + + +{{ template "chart.valuesSection" . }} + diff --git a/charts/feature-cluster-metrics/charts/kepler-0.5.9.tgz b/charts/feature-cluster-metrics/charts/kepler-0.5.9.tgz new file mode 100644 index 000000000..80b2f9012 Binary files /dev/null and b/charts/feature-cluster-metrics/charts/kepler-0.5.9.tgz differ diff --git a/charts/feature-cluster-metrics/charts/kube-state-metrics-5.25.1.tgz b/charts/feature-cluster-metrics/charts/kube-state-metrics-5.25.1.tgz new file mode 100644 index 000000000..6d362292d Binary files /dev/null and b/charts/feature-cluster-metrics/charts/kube-state-metrics-5.25.1.tgz differ diff --git a/charts/feature-cluster-metrics/charts/prometheus-node-exporter-4.39.0.tgz b/charts/feature-cluster-metrics/charts/prometheus-node-exporter-4.39.0.tgz new file mode 100644 index 000000000..0750218b7 Binary files /dev/null and b/charts/feature-cluster-metrics/charts/prometheus-node-exporter-4.39.0.tgz differ diff --git a/charts/feature-cluster-metrics/charts/prometheus-windows-exporter-0.5.1.tgz b/charts/feature-cluster-metrics/charts/prometheus-windows-exporter-0.5.1.tgz new file mode 100644 index 000000000..f16d0b5a0 Binary files /dev/null and b/charts/feature-cluster-metrics/charts/prometheus-windows-exporter-0.5.1.tgz differ diff --git a/charts/feature-cluster-metrics/default-allow-lists/cadvisor.yaml b/charts/feature-cluster-metrics/default-allow-lists/cadvisor.yaml new file mode 100644 index 000000000..c8db3c16b --- 
/dev/null +++ b/charts/feature-cluster-metrics/default-allow-lists/cadvisor.yaml @@ -0,0 +1,20 @@ +--- +# The minimal set of metrics from cAdvisor required for Kubernetes Monitoring +- container_cpu_cfs_periods_total +- container_cpu_cfs_throttled_periods_total +- container_cpu_usage_seconds_total +- container_fs_reads_bytes_total +- container_fs_reads_total +- container_fs_writes_bytes_total +- container_fs_writes_total +- container_memory_cache +- container_memory_rss +- container_memory_swap +- container_memory_working_set_bytes +- container_network_receive_bytes_total +- container_network_receive_packets_dropped_total +- container_network_receive_packets_total +- container_network_transmit_bytes_total +- container_network_transmit_packets_dropped_total +- container_network_transmit_packets_total +- machine_memory_bytes diff --git a/charts/feature-cluster-metrics/default-allow-lists/kepler.yaml b/charts/feature-cluster-metrics/default-allow-lists/kepler.yaml new file mode 100644 index 000000000..58e46a336 --- /dev/null +++ b/charts/feature-cluster-metrics/default-allow-lists/kepler.yaml @@ -0,0 +1,3 @@ +--- +# The minimal set of metrics from Kepler required for Kubernetes Monitoring +- kepler_.* diff --git a/charts/feature-cluster-metrics/default-allow-lists/kube-state-metrics.yaml b/charts/feature-cluster-metrics/default-allow-lists/kube-state-metrics.yaml new file mode 100644 index 000000000..a46445b90 --- /dev/null +++ b/charts/feature-cluster-metrics/default-allow-lists/kube-state-metrics.yaml @@ -0,0 +1,30 @@ +--- +# The minimal set of metrics from kube-state-metrics required for Kubernetes Monitoring +- kube_daemonset.* +- kube_deployment_metadata_generation +- kube_deployment_spec_replicas +- kube_deployment_status_observed_generation +- kube_deployment_status_replicas_available +- kube_deployment_status_replicas_updated +- kube_horizontalpodautoscaler_spec_max_replicas +- kube_horizontalpodautoscaler_spec_min_replicas +- 
kube_horizontalpodautoscaler_status_current_replicas +- kube_horizontalpodautoscaler_status_desired_replicas +- kube_job.* +- kube_namespace_status_phase +- kube_node.* +- kube_persistentvolumeclaim_resource_requests_storage_bytes +- kube_pod_container_info +- kube_pod_container_resource_limits +- kube_pod_container_resource_requests +- kube_pod_container_status_last_terminated_reason +- kube_pod_container_status_restarts_total +- kube_pod_container_status_waiting_reason +- kube_pod_info +- kube_pod_owner +- kube_pod_start_time +- kube_pod_status_phase +- kube_pod_status_reason +- kube_replicaset.* +- kube_resourcequota +- kube_statefulset.* diff --git a/charts/feature-cluster-metrics/default-allow-lists/kubelet.yaml b/charts/feature-cluster-metrics/default-allow-lists/kubelet.yaml new file mode 100644 index 000000000..417fb0eb0 --- /dev/null +++ b/charts/feature-cluster-metrics/default-allow-lists/kubelet.yaml @@ -0,0 +1,34 @@ +--- +# The minimal set of metrics from the Kubelet required for Kubernetes Monitoring +- container_cpu_usage_seconds_total +- kubelet_certificate_manager_client_expiration_renew_errors +- kubelet_certificate_manager_client_ttl_seconds +- kubelet_certificate_manager_server_ttl_seconds +- kubelet_cgroup_manager_duration_seconds_bucket +- kubelet_cgroup_manager_duration_seconds_count +- kubelet_node_config_error +- kubelet_node_name +- kubelet_pleg_relist_duration_seconds_bucket +- kubelet_pleg_relist_duration_seconds_count +- kubelet_pleg_relist_interval_seconds_bucket +- kubelet_pod_start_duration_seconds_bucket +- kubelet_pod_start_duration_seconds_count +- kubelet_pod_worker_duration_seconds_bucket +- kubelet_pod_worker_duration_seconds_count +- kubelet_running_container_count +- kubelet_running_containers +- kubelet_running_pod_count +- kubelet_running_pods +- kubelet_runtime_operations_errors_total +- kubelet_runtime_operations_total +- kubelet_server_expiration_renew_errors +- kubelet_volume_stats_available_bytes +- 
kubelet_volume_stats_capacity_bytes +- kubelet_volume_stats_inodes +- kubelet_volume_stats_inodes_used +- kubernetes_build_info +- namespace_workload_pod +- rest_client_requests_total +- storage_operation_duration_seconds_count +- storage_operation_errors_total +- volume_manager_total_volumes diff --git a/charts/feature-cluster-metrics/default-allow-lists/node-exporter-integration.yaml b/charts/feature-cluster-metrics/default-allow-lists/node-exporter-integration.yaml new file mode 100644 index 000000000..e2ee3383d --- /dev/null +++ b/charts/feature-cluster-metrics/default-allow-lists/node-exporter-integration.yaml @@ -0,0 +1,111 @@ +--- +# The set of metrics from Node Exporter required for the Node Exporter integration +- node_arp_entries +- node_boot_time_seconds +- node_context_switches_total +- node_disk_io_time_seconds_total +- node_disk_io_time_weighted_seconds_total +- node_disk_read_bytes_total +- node_disk_read_time_seconds_total +- node_disk_reads_completed_total +- node_disk_write_time_seconds_total +- node_disk_writes_completed_total +- node_disk_written_bytes_total +- node_filefd_allocated +- node_filefd_maximum +- node_intr_total +- node_load1 +- node_load15 +- node_load5 +- node_md_disks +- node_md_disks_required +- node_netstat_Icmp6_InErrors +- node_netstat_Icmp6_InMsgs +- node_netstat_Icmp6_OutMsgs +- node_netstat_Icmp_InErrors +- node_netstat_Icmp_InMsgs +- node_netstat_Icmp_OutMsgs +- node_netstat_IpExt_InOctets +- node_netstat_IpExt_OutOctets +- node_netstat_TcpExt_ListenDrops +- node_netstat_TcpExt_ListenOverflows +- node_netstat_TcpExt_TCPSynRetrans +- node_netstat_Tcp_InErrs +- node_netstat_Tcp_InSegs +- node_netstat_Tcp_OutRsts +- node_netstat_Tcp_OutSegs +- node_netstat_Tcp_RetransSegs +- node_netstat_Udp6_InDatagrams +- node_netstat_Udp6_InErrors +- node_netstat_Udp6_NoPorts +- node_netstat_Udp6_OutDatagrams +- node_netstat_Udp6_RcvbufErrors +- node_netstat_Udp6_SndbufErrors +- node_netstat_UdpLite_InErrors +- node_netstat_Udp_InDatagrams 
+- node_netstat_Udp_InErrors +- node_netstat_Udp_NoPorts +- node_netstat_Udp_OutDatagrams +- node_netstat_Udp_RcvbufErrors +- node_netstat_Udp_SndbufErrors +- node_network_carrier +- node_network_info +- node_network_mtu_bytes +- node_network_receive_bytes_total +- node_network_receive_compressed_total +- node_network_receive_drop_total +- node_network_receive_errs_total +- node_network_receive_fifo_total +- node_network_receive_multicast_total +- node_network_receive_packets_total +- node_network_speed_bytes +- node_network_transmit_bytes_total +- node_network_transmit_compressed_total +- node_network_transmit_drop_total +- node_network_transmit_errs_total +- node_network_transmit_fifo_total +- node_network_transmit_multicast_total +- node_network_transmit_packets_total +- node_network_transmit_queue_length +- node_network_up +- node_nf_conntrack_entries +- node_nf_conntrack_entries_limit +- node_os_info +- node_sockstat_FRAG6_inuse +- node_sockstat_FRAG_inuse +- node_sockstat_RAW6_inuse +- node_sockstat_RAW_inuse +- node_sockstat_TCP6_inuse +- node_sockstat_TCP_alloc +- node_sockstat_TCP_inuse +- node_sockstat_TCP_mem +- node_sockstat_TCP_mem_bytes +- node_sockstat_TCP_orphan +- node_sockstat_TCP_tw +- node_sockstat_UDP6_inuse +- node_sockstat_UDPLITE6_inuse +- node_sockstat_UDPLITE_inuse +- node_sockstat_UDP_inuse +- node_sockstat_UDP_mem +- node_sockstat_UDP_mem_bytes +- node_sockstat_sockets_used +- node_softnet_dropped_total +- node_softnet_processed_total +- node_softnet_times_squeezed_total +- node_systemd_unit_state +- node_textfile_scrape_error +- node_time_zone_offset_seconds +- node_timex_estimated_error_seconds +- node_timex_maxerror_seconds +- node_timex_offset_seconds +- node_timex_sync_status +- node_uname_info +- node_vmstat_oom_kill +- node_vmstat_pgfault +- node_vmstat_pgmajfault +- node_vmstat_pgpgin +- node_vmstat_pgpgout +- node_vmstat_pswpin +- node_vmstat_pswpout +- process_max_fds +- process_open_fds diff --git 
a/charts/feature-cluster-metrics/default-allow-lists/node-exporter.yaml b/charts/feature-cluster-metrics/default-allow-lists/node-exporter.yaml new file mode 100644 index 000000000..871d9bc6f --- /dev/null +++ b/charts/feature-cluster-metrics/default-allow-lists/node-exporter.yaml @@ -0,0 +1,12 @@ +--- +# The minimal set of metrics from the Node Exporter required for Kubernetes Monitoring +- node_cpu.* +- node_exporter_build_info +- node_filesystem.* +- node_memory.* +- node_network_receive_bytes_total +- node_network_receive_drop_total +- node_network_transmit_bytes_total +- node_network_transmit_drop_total +- process_cpu_seconds_total +- process_resident_memory_bytes diff --git a/charts/feature-cluster-metrics/default-allow-lists/opencost.yaml b/charts/feature-cluster-metrics/default-allow-lists/opencost.yaml new file mode 100644 index 000000000..e7c5ce884 --- /dev/null +++ b/charts/feature-cluster-metrics/default-allow-lists/opencost.yaml @@ -0,0 +1,27 @@ +--- +# The minimal set of metrics from OpenCost required for Kubernetes Monitoring +- container_cpu_allocation +- container_gpu_allocation +- container_memory_allocation_bytes +- deployment_match_labels +- kubecost_cluster_info +- kubecost_cluster_management_cost +- kubecost_cluster_memory_working_set_bytes +- kubecost_http_requests_total +- kubecost_http_response_size_bytes +- kubecost_http_response_time_seconds +- kubecost_load_balancer_cost +- kubecost_network_internet_egress_cost +- kubecost_network_region_egress_cost +- kubecost_network_zone_egress_cost +- kubecost_node_is_spot +- node_cpu_hourly_cost +- node_gpu_count +- node_gpu_hourly_cost +- node_ram_hourly_cost +- node_total_hourly_cost +- opencost_build_info +- pod_pvc_allocation +- pv_hourly_cost +- service_selector_labels +- statefulSet_match_labels diff --git a/charts/feature-cluster-metrics/default-allow-lists/windows-exporter.yaml b/charts/feature-cluster-metrics/default-allow-lists/windows-exporter.yaml new file mode 100644 index 
000000000..765f92300 --- /dev/null +++ b/charts/feature-cluster-metrics/default-allow-lists/windows-exporter.yaml @@ -0,0 +1,7 @@ +--- +# The minimal set of metrics from Windows Exporter required for Kubernetes Monitoring +- windows_.* +- node_cpu_seconds_total +- node_filesystem_size_bytes +- node_filesystem_avail_bytes +- container_cpu_usage_seconds_total diff --git a/charts/feature-cluster-metrics/schema-mods/remove-subchart-fields.jq b/charts/feature-cluster-metrics/schema-mods/remove-subchart-fields.jq new file mode 100644 index 000000000..45dd438ea --- /dev/null +++ b/charts/feature-cluster-metrics/schema-mods/remove-subchart-fields.jq @@ -0,0 +1 @@ +del(.properties["node-exporter"].properties.service) diff --git a/charts/feature-cluster-metrics/schema-mods/types-and-enums.json b/charts/feature-cluster-metrics/schema-mods/types-and-enums.json new file mode 100644 index 000000000..bd28efff8 --- /dev/null +++ b/charts/feature-cluster-metrics/schema-mods/types-and-enums.json @@ -0,0 +1,5 @@ +{ + "properties": { + "global": {"properties": {"platform": {"enum": ["", "openshift"]}}} + } +} diff --git a/charts/feature-cluster-metrics/templates/_api_server.alloy.tpl b/charts/feature-cluster-metrics/templates/_api_server.alloy.tpl new file mode 100644 index 000000000..04e3d673f --- /dev/null +++ b/charts/feature-cluster-metrics/templates/_api_server.alloy.tpl @@ -0,0 +1,33 @@ +{{ define "feature.clusterMetrics.apiServer.allowList" }} +{{ if .Values.apiServer.metricsTuning.includeMetrics }} +{{ .Values.apiServer.metricsTuning.includeMetrics | toYaml }} +{{ end }} +{{ end }} + +{{- define "feature.clusterMetrics.apiServer.alloy" }} +{{- if or .Values.apiServer.enabled (and .Values.controlPlane.enabled (not (eq .Values.apiServer.enabled false))) }} +{{- $metricAllowList := .Values.apiServer.metricsTuning.includeMetrics }} +{{- $metricDenyList := .Values.apiServer.metricsTuning.excludeMetrics }} + +kubernetes.apiserver "scrape" { + clustering = true +{{- if 
$metricAllowList }} + keep_metrics = "up|{{ $metricAllowList | join "|" }}" +{{- end }} +{{- if $metricDenyList }} + drop_metrics = {{ $metricDenyList | join "|" | quote }} +{{- end }} + scrape_interval = {{ .Values.apiServer.scrapeInterval | default .Values.global.scrapeInterval | quote }} + max_cache_size = {{ .Values.apiServer.maxCacheSize | default .Values.global.maxCacheSize | int }} +{{- if .Values.apiServer.extraMetricProcessingRules }} + forward_to = [prometheus.relabel.apiServer.receiver] +} + +prometheus.relabel "apiServer" { + max_cache_size = {{ .Values.apiServer.maxCacheSize | default .Values.global.maxCacheSize | int }} + {{ .Values.apiServer.extraMetricProcessingRules | indent 2 }} +{{- end }} + forward_to = argument.metrics_destinations.value +} +{{- end }} +{{- end }} diff --git a/charts/feature-cluster-metrics/templates/_cadvisor.alloy.tpl b/charts/feature-cluster-metrics/templates/_cadvisor.alloy.tpl new file mode 100644 index 000000000..8cb9e5c11 --- /dev/null +++ b/charts/feature-cluster-metrics/templates/_cadvisor.alloy.tpl @@ -0,0 +1,109 @@ +{{ define "feature.clusterMetrics.cadvisor.allowList" }} +{{ if .Values.cadvisor.metricsTuning.useDefaultAllowList }} +{{ "default-allow-lists/cadvisor.yaml" | .Files.Get }} +{{ end }} +{{ if .Values.cadvisor.metricsTuning.includeMetrics }} +{{ .Values.cadvisor.metricsTuning.includeMetrics | toYaml }} +{{ end }} +{{ end }} + +{{- define "feature.clusterMetrics.cadvisor.alloy" }} +{{- if .Values.cadvisor.enabled }} +{{- $metricAllowList := include "feature.clusterMetrics.cadvisor.allowList" . 
}} +{{- $metricDenyList := .Values.cadvisor.metricsTuning.excludeMetrics }} + +kubernetes.cadvisor "scrape" { + clustering = true +{{- if $metricAllowList }} + keep_metrics = "up|{{ $metricAllowList | fromYamlArray | join "|" }}" +{{- end }} +{{- if $metricDenyList }} + drop_metrics = {{ $metricDenyList | join "|" | quote }} +{{- end }} + scrape_interval = {{ .Values.cadvisor.scrapeInterval | default .Values.global.scrapeInterval | quote }} + max_cache_size = {{ .Values.cadvisor.maxCacheSize | default .Values.global.maxCacheSize | int }} + forward_to = [prometheus.relabel.cadvisor.receiver] +} + +prometheus.relabel "cadvisor" { + max_cache_size = {{ .Values.cadvisor.maxCacheSize | default .Values.global.maxCacheSize | int }} + +{{- if .Values.cadvisor.metricsTuning.dropEmptyContainerLabels }} + // Drop empty container labels, addressing https://github.com/google/cadvisor/issues/2688 + rule { + source_labels = ["__name__","container"] + separator = "@" + regex = "(container_cpu_.*|container_fs_.*|container_memory_.*)@" + action = "drop" + } +{{- end }} +{{- if .Values.cadvisor.metricsTuning.dropEmptyImageLabels }} + // Drop empty image labels, addressing https://github.com/google/cadvisor/issues/2688 + rule { + source_labels = ["__name__","image"] + separator = "@" + regex = "(container_cpu_.*|container_fs_.*|container_memory_.*|container_network_.*)@" + action = "drop" + } +{{- end }} +{{- if .Values.cadvisor.metricsTuning.normalizeUnnecessaryLabels }} + // Normalizing unimportant labels (not deleting to continue satisfying