-
Notifications
You must be signed in to change notification settings - Fork 44
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add example otel-collector configurations #1152
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,8 @@ package e2e | |
import ( | ||
"context" | ||
"fmt" | ||
"os" | ||
"path/filepath" | ||
"strings" | ||
"testing" | ||
"time" | ||
|
@@ -28,12 +30,14 @@ import ( | |
"google.golang.org/api/iterator" | ||
"google.golang.org/genproto/googleapis/api/metric" | ||
"google.golang.org/genproto/googleapis/api/monitoredres" | ||
"gopkg.in/yaml.v2" | ||
corev1 "k8s.io/api/core/v1" | ||
apierrors "k8s.io/apimachinery/pkg/api/errors" | ||
"k8s.io/apimachinery/pkg/types" | ||
"kpt.dev/configsync/e2e" | ||
"kpt.dev/configsync/e2e/nomostest" | ||
"kpt.dev/configsync/e2e/nomostest/iam" | ||
testmetrics "kpt.dev/configsync/e2e/nomostest/metrics" | ||
"kpt.dev/configsync/e2e/nomostest/ntopts" | ||
"kpt.dev/configsync/e2e/nomostest/retry" | ||
nomostesting "kpt.dev/configsync/e2e/nomostest/testing" | ||
|
@@ -212,6 +216,158 @@ func TestOtelCollectorDeployment(t *testing.T) { | |
} | ||
} | ||
|
||
// TestOtelCollectorSampleConfigurations validates that metrics reporting works for | ||
// Google Cloud Monitoring using the sample custom configurations. | ||
// | ||
// Requirements: | ||
// - node identity: | ||
// - node GSA with roles/monitoring.metricWriter IAM | ||
// | ||
// - workload identity: | ||
// - e2e-test-metric-writer GSA with roles/monitoring.metricWriter IAM | ||
// - roles/iam.workloadIdentityUser on config-management-monitoring/default for e2e-test-metric-writer | ||
func TestOtelCollectorSampleConfigurations(t *testing.T) { | ||
nt := nomostest.New(t, | ||
nomostesting.Reconciliation1, | ||
ntopts.RequireGKE(t), | ||
ntopts.Unstructured, | ||
) | ||
nt.T.Cleanup(func() { | ||
if t.Failed() { | ||
nt.PodLogs("config-management-monitoring", ocmetrics.OtelCollectorName, "", false) | ||
} | ||
}) | ||
setupMetricsServiceAccount(nt) | ||
|
||
nt.T.Cleanup(func() { | ||
nt.MustKubectl("delete", "cm", ocmetrics.OtelCollectorCustomCM, "-n", configmanagement.MonitoringNamespace, "--ignore-not-found") | ||
nt.T.Log("Restart otel-collector pod to reset the ConfigMap and log") | ||
nomostest.DeletePodByLabel(nt, "app", ocmetrics.OpenTelemetry, false) | ||
if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil { | ||
nt.T.Errorf("otel-collector pod failed to come up after a restart: %v", err) | ||
} | ||
}) | ||
|
||
nt.T.Log("Restart otel-collector pod to refresh the ConfigMap, log and IAM") | ||
nomostest.DeletePodByLabel(nt, "app", ocmetrics.OpenTelemetry, false) | ||
if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil { | ||
nt.T.Fatal(err) | ||
} | ||
|
||
startTime := time.Now().UTC() | ||
ctx := nt.Context | ||
client, err := createGCMClient(ctx) | ||
if err != nil { | ||
nt.T.Fatal(err) | ||
} | ||
|
||
nt.T.Log("Add the kustomize components root directory to enable kustomize metrics") | ||
nt.Must(nt.RootRepos[configsync.RootSyncName].Copy("../testdata/hydration/kustomize-components", ".")) | ||
nt.Must(nt.RootRepos[configsync.RootSyncName].CommitAndPush("add DRY configs to the repository")) | ||
|
||
nt.T.Log("Update RootSync to sync from the kustomize-components directory") | ||
rs := fake.RootSyncObjectV1Beta1(configsync.RootSyncName) | ||
nt.MustMergePatch(rs, `{"spec": {"git": {"dir": "kustomize-components"}}}`) | ||
syncDirMap := map[types.NamespacedName]string{ | ||
nomostest.DefaultRootRepoNamespacedName: "kustomize-components", | ||
} | ||
if err := nt.WatchForAllSyncs(nomostest.WithSyncDirectoryMap(syncDirMap)); err != nil { | ||
nt.T.Fatal(err) | ||
} | ||
|
||
directory := "../../examples/otel-collector-sample-configurations" | ||
dirEntry, err := os.ReadDir(directory) | ||
if err != nil { | ||
nt.T.Fatal("Error opening directory:", err) | ||
} | ||
for _, entry := range dirEntry { | ||
if !entry.IsDir() && strings.HasSuffix(entry.Name(), ".yaml") { | ||
fileName := entry.Name() | ||
fullPath := filepath.Join(directory, fileName) | ||
nt.T.Log("Apply sample custom otel-collector configuration", fileName) | ||
nt.MustKubectl("apply", "-f", fullPath) | ||
|
||
err := nt.Validate(ocmetrics.OtelCollectorCustomCM, configmanagement.MonitoringNamespace, &corev1.ConfigMap{}) | ||
if err != nil { | ||
nt.T.Fatal(err) | ||
} | ||
|
||
content, err := os.ReadFile(fullPath) | ||
if err != nil { | ||
nt.T.Fatal("failed to read file: %v", err) | ||
} | ||
|
||
var configMap ConfigMap | ||
if err := yaml.Unmarshal(content, &configMap); err != nil { | ||
nt.T.Fatal("error unmarshalling YAML: %v", err) | ||
} | ||
|
||
var otelConfig OtelConfig | ||
if err := yaml.Unmarshal([]byte(configMap.Data.OtelCollectorConfig), &otelConfig); err != nil { | ||
nt.T.Fatal("error: ", err) | ||
} | ||
|
||
// check cloud monitoring | ||
_, err = retry.Retry(60*time.Second, func() error { | ||
includGCM := pipelinesInclude("metrics/cloudmonitoring", otelConfig) | ||
for _, metricType := range GCMMetricTypes { | ||
descriptor := fmt.Sprintf("%s/%s", GCMMetricPrefix, metricType) | ||
it := listMetricInGCM(ctx, nt, client, startTime, descriptor) | ||
if includGCM { | ||
return validateMetricInGCM(nt, it, descriptor, nt.ClusterName) | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
} | ||
return nil | ||
}) | ||
if err != nil { | ||
nt.T.Fatal(err) | ||
} | ||
|
||
// check prometheus | ||
if pipelinesInclude("metrics/prometheus", otelConfig) { | ||
summary := testmetrics.Summary{ | ||
Sync: nomostest.RootSyncNN(configsync.RootSyncName), | ||
} | ||
if _, found := nt.RootRepos[summary.Sync.Name]; !found { | ||
nt.T.Fatal("Rootsync not found", configsync.RootSyncName) | ||
} | ||
commitHash, err := nt.RootRepos[summary.Sync.Name].Hash() | ||
if err != nil { | ||
nt.T.Fatal() | ||
} | ||
syncLabels, err := nomostest.MetricLabelsForRootSync(nt, summary.Sync) | ||
if err != nil { | ||
nt.T.Fatal(err) | ||
} | ||
err = nomostest.ValidateMetrics(nt, | ||
nomostest.ReconcilerSyncSuccess(nt, syncLabels, commitHash), | ||
nomostest.ReconcilerErrorMetrics(nt, syncLabels, commitHash, summary.Errors)) | ||
if err != nil { | ||
nt.T.Fatal(err) | ||
} | ||
} | ||
|
||
nt.T.Log("Checking the otel-collector log contains no failure...") | ||
err = validateDeploymentLogHasNoFailure(nt, ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace, MetricExportErrorCaption) | ||
if err != nil { | ||
nt.T.Fatal(err) | ||
} | ||
|
||
nt.T.Log("Remove sample custom otel-collector configuration %v", fileName) | ||
nt.MustKubectl("delete", "cm", ocmetrics.OtelCollectorCustomCM, "-n", configmanagement.MonitoringNamespace, "--ignore-not-found") | ||
err = nt.ValidateNotFoundOrNoMatch(ocmetrics.OtelCollectorCustomCM, configmanagement.MonitoringNamespace, &corev1.ConfigMap{}) | ||
if err != nil { | ||
nt.T.Fatal(err) | ||
} | ||
nt.T.Log("Restart otel-collector pod to refresh the ConfigMap and log") | ||
//nomostest.DeletePodByLabel(nt, "app", ocmetrics.OpenTelemetry, false) | ||
if err := nt.Watcher.WatchForCurrentStatus(kinds.Deployment(), ocmetrics.OtelCollectorName, configmanagement.MonitoringNamespace); err != nil { | ||
nt.T.Fatal(err) | ||
} | ||
} | ||
} | ||
} | ||
|
||
// TestOtelCollectorGCMLabelAggregation validates Google Cloud Monitoring | ||
// metrics to ensure that the "commit" label is removed through aggregation in | ||
// the otel-collector config. | ||
|
@@ -429,3 +585,30 @@ func validateMetricInGCM(nt *nomostest.NT, it *monitoringv2.TimeSeriesIterator, | |
return fmt.Errorf("GCM metric %s not found (cluster_name=%s)", | ||
metricType, nt.ClusterName) | ||
} | ||
|
||
// Minimal YAML views used to inspect a sample otel-collector ConfigMap.
// Only the fields the tests read are modelled.
type (
	// ConfigMap exposes the "otel-collector-config.yaml" entry found under
	// the "data" key of a ConfigMap manifest.
	ConfigMap struct {
		Data struct {
			OtelCollectorConfig string `yaml:"otel-collector-config.yaml"`
		} `yaml:"data"`
	}

	// OtelConfig exposes the pipelines declared under the "service" key of
	// an otel-collector configuration, keyed by pipeline name.
	OtelConfig struct {
		Service struct {
			Pipelines map[string]Pipeline `yaml:"pipelines"`
		} `yaml:"service"`
	}

	// Pipeline lists the receivers, processors, and exporters of a single
	// otel-collector pipeline.
	Pipeline struct {
		Receivers  []string `yaml:"receivers"`
		Processors []string `yaml:"processors"`
		Exporters  []string `yaml:"exporters"`
	}
)
|
||
func pipelinesInclude(name string, config OtelConfig) bool { | ||
for pipelineName := range config.Service.Pipelines { | ||
if pipelineName == name { | ||
return true | ||
} | ||
} | ||
return false | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# Purpose of this directory | ||
|
||
This folder provides sample [custom monitoring configurations](https://cloud.google.com/anthos-config-management/docs/how-to/monitor-config-sync-custom) | ||
for Config Sync. These examples are intended for your convenience. While Config | ||
Sync strives to keep them updated, always consult the [latest configuration](https://github.com/GoogleContainerTools/kpt-config-sync/blob/main/pkg/metrics/otel.go). | ||
janetkuo marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# Available Configurations | ||
|
||
_otel-collector-monarch.yaml_: Serves as a template for exporting metrics | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggest bold instead of underline. underline usually implies a link. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or even inline code maybe? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This format is italic, not underline. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Which is the usual format when quoting a book name or doc name? I assume. |
||
exclusively to Google Cloud Monarch and Prometheus. | ||
|
||
_otel-collector-prometheus.yaml_: Serves as a template for exporting metrics | ||
exclusively to Prometheus. | ||
|
||
# Instructions | ||
|
||
* **Apply ConfigMap**: Apply the desired ConfigMap to your cluster. | ||
* **Restart otel-collector**: The otel-collector deployment should restart automatically. If it doesn't, execute: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How does it know to restart? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The controller looks for a change in annotation and restarts the pod. |
||
``` | ||
kubectl rollout restart deployment otel-collector -n config-management-monitoring | ||
``` | ||
# Removal | ||
|
||
* **Delete ConfigMap**: Remove the ConfigMap from your cluster. | ||
* **Restart otel-collector**: The otel-collector deployment should restart automatically. If it doesn't, use the same kubectl rollout restart command as above. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
# Copyright 2024 Google LLC | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How will we know to update these when we change the gcloud template? Is there any way we can have a linter that produces this from that and then verify it matches without drift? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd prefer we migrate away from the manual templating asap before implementing too much scripts, i.e. having API for monitoring configuration and filtering. These samples exist as a intermediate solution. |
||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
apiVersion: v1 | ||
kind: ConfigMap | ||
metadata: | ||
labels: | ||
app: opentelemetry | ||
component: otel-collector | ||
configmanagement.gke.io/arch: csmr | ||
configmanagement.gke.io/system: "true" | ||
name: otel-collector-custom | ||
namespace: config-management-monitoring | ||
data: | ||
otel-collector-config.yaml: |- | ||
receivers: | ||
opencensus: | ||
exporters: | ||
prometheus: | ||
endpoint: :8675 | ||
namespace: config_sync | ||
resource_to_telemetry_conversion: | ||
enabled: true | ||
googlecloud/kubernetes: | ||
metric: | ||
prefix: "kubernetes.io/internal/addons/config_sync/" | ||
# skip_create_descriptor: Metrics that start with 'kubernetes.io/' already | ||
# have descriptors defined internally. Skip sending duplicated metric | ||
# descriptors here to prevent errors or conflicts. | ||
skip_create_descriptor: true | ||
# instrumentation_library_labels: Otel Collector by default attaches | ||
# 'instrumentation_version' and 'instrumentation_source' labels that are | ||
# not specified in our Cloud Monarch definitions, thus skipping them here | ||
instrumentation_library_labels: false | ||
# create_service_timeseries: This is the recommended configuration for | ||
# 'service metrics' that start with the 'kubernetes.io/' prefix. It uses the | ||
# CreateTimeSeries API and has its own quotas, so that custom metric writes | ||
# will not break this ingestion pipeline | ||
create_service_timeseries: true | ||
service_resource_labels: false | ||
retry_on_failure: | ||
enabled: false | ||
sending_queue: | ||
enabled: false | ||
processors: | ||
batch: | ||
# resourcedetection: This processor is needed to correctly mirror resource | ||
# labels from OpenCensus to OpenTelemetry. We also want to keep this same | ||
# processor in Otel Agent configuration as the resource labels are added from | ||
# there | ||
resourcedetection: | ||
detectors: [env, gcp] | ||
filter/kubernetes: | ||
metrics: | ||
include: | ||
match_type: regexp | ||
metric_names: | ||
- kustomize.* | ||
- api_duration_seconds | ||
- reconciler_errors | ||
- pipeline_error_observed | ||
- reconcile_duration_seconds | ||
- rg_reconcile_duration_seconds | ||
- parser_duration_seconds | ||
- declared_resources | ||
- apply_operations_total | ||
- apply_duration_seconds | ||
- resource_fights_total | ||
- remediate_duration_seconds | ||
- resource_conflicts_total | ||
- internal_errors_total | ||
- rendering_count_total | ||
- skip_rendering_count_total | ||
- resource_override_count_total | ||
- git_sync_depth_override_count_total | ||
- no_ssl_verify_count_total | ||
- kcc_resource_count | ||
- last_sync_timestamp | ||
# Remove custom configsync metric labels that are not registered with Monarch | ||
# This action applies to all metrics that are sent through the pipeline that | ||
# is using this processor | ||
attributes/kubernetes: | ||
actions: | ||
- key: configsync.sync.kind | ||
action: delete | ||
- key: configsync.sync.name | ||
action: delete | ||
- key: configsync.sync.namespace | ||
action: delete | ||
- key: commit | ||
action: delete | ||
metricstransform/kubernetes: | ||
transforms: | ||
- include: declared_resources | ||
action: update | ||
new_name: current_declared_resources | ||
- include: reconciler_errors | ||
action: update | ||
new_name: last_reconciler_errors | ||
- include: pipeline_error_observed | ||
action: update | ||
new_name: last_pipeline_error_observed | ||
- include: apply_operations_total | ||
action: update | ||
new_name: apply_operations_count | ||
- include: resource_fights_total | ||
action: update | ||
new_name: resource_fights_count | ||
- include: resource_conflicts_total | ||
action: update | ||
new_name: resource_conflicts_count | ||
- include: internal_errors_total | ||
action: update | ||
new_name: internal_errors_count | ||
- include: rendering_count_total | ||
action: update | ||
new_name: rendering_count | ||
- include: skip_rendering_count_total | ||
action: update | ||
new_name: skip_rendering_count | ||
- include: resource_override_count_total | ||
action: update | ||
new_name: resource_override_count | ||
- include: git_sync_depth_override_count_total | ||
action: update | ||
new_name: git_sync_depth_override_count | ||
- include: no_ssl_verify_count_total | ||
action: update | ||
new_name: no_ssl_verify_count | ||
extensions: | ||
health_check: | ||
service: | ||
extensions: [health_check] | ||
pipelines: | ||
metrics/prometheus: | ||
receivers: [opencensus] | ||
processors: [batch] | ||
exporters: [prometheus] | ||
metrics/kubernetes: | ||
receivers: [opencensus] | ||
processors: [batch, filter/kubernetes, attributes/kubernetes, metricstransform/kubernetes, resourcedetection] | ||
exporters: [googlecloud/kubernetes] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't need this anymore with BYOID, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good catch. Tweaking the setup now, the tests are passing on failure scenarios, still working on it.