Skip to content

Commit

Permalink
fix(infra): remove high cardinality prometheus metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
NathanFlurry authored and MasterPtato committed May 31, 2024
1 parent 8ee4366 commit dceb970
Show file tree
Hide file tree
Showing 7 changed files with 18 additions and 19 deletions.
2 changes: 1 addition & 1 deletion infra/tf/vector/vector.tf
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ resource "helm_release" "vector" {
namespace = "vector"
repository = "https://helm.vector.dev"
chart = "vector"
version = "0.29.0"
version = "0.38.0"
values = [yamlencode({
role = "Aggregator"
podPriorityClassName = "service-priority"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,8 @@ pub async fn gg_static_config() -> GlobalResult<String> {
port = TUNNEL_API_INTERNAL_PORT,
);

// Metrics are disabled since their cardinality is too high for Prometheus (both the number of
// entrypoints & the frequently changing routers + services)
let mut config = formatdoc!(
r#"
[entryPoints]
Expand All @@ -239,13 +241,6 @@ pub async fn gg_static_config() -> GlobalResult<String> {
[api]
insecure = true
[metrics.prometheus]
# See lib/chirp/metrics/src/buckets.rs
buckets = [0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 25.0, 50.0, 100.0]
addEntryPointsLabels = true
addRoutersLabels = true
addServicesLabels = true
[providers]
[providers.file]
directory = "/etc/game_guard/dynamic"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ pub fn configure(config: &Config, pool_type: backend::cluster::PoolType) -> Stri
type = "remap"
inputs = [{sources}]
source = '''
# Drop go stats
if starts_with!(.name, "go_") {{
abort
}}
.tags.server_id = "___SERVER_ID___"
.tags.datacenter_id = "___DATACENTER_ID___"
.tags.cluster_id = "___CLUSTER_ID___"
Expand All @@ -55,6 +60,11 @@ pub fn configure(config: &Config, pool_type: backend::cluster::PoolType) -> Stri
address = "127.0.0.1:{TUNNEL_VECTOR_PORT}"
healthcheck.enabled = false
compression = true
# Buffer to disk for durability & reduce memory usage
buffer.max_events = 500
buffer.max_size = 268435488
buffer.type = "disk"
"#
);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ After=network-online.target
User=node_exporter
Group=node_exporter
Type=simple
ExecStart=/usr/bin/node_exporter --collector.cgroups --collector.network_route --collector.systemd
# Reduce cardinality
ExecStart=/usr/bin/node_exporter --collector.disable-defaults --collector.cpu --collector.conntrack --collector.meminfo --collector.filesystem --collector.filesystem.mount-points-exclude=^/opt/nomad/
Restart=always
RestartSec=2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -306,8 +306,9 @@ telemetry {
collection_interval = "5s"
disable_hostname = true
prometheus_metrics = true
publish_allocation_metrics = true
publish_node_metrics = true
# Don't publish_allocation_metrics because of high cardinality
}
# Needed for Prometheus rate limiting
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
version="0.34.1"
version="0.38.0"

# Create vector user
if ! id -u "vector" &>/dev/null; then
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,14 +107,6 @@ pub async fn gen_initialize(pool_type: backend::cluster::PoolType) -> GlobalResu
tcp_server_transports: Default::default(),
},
));

prometheus_targets.insert(
GG_TRAEFIK_INSTANCE_NAME.into(),
components::vector::PrometheusTarget {
endpoint: "http://127.0.0.1:9980/metrics".into(),
scrape_interval: 15,
},
);
}
backend::cluster::PoolType::Ats => {
script.push(components::traffic_server::configure().await?);
Expand Down

0 comments on commit dceb970

Please sign in to comment.