Skip to content

Commit

Permalink
add prometheus scrape jobs for all units (#264)
Browse files Browse the repository at this point in the history
Updates the scrape job generated by charm.py to include all units, not just the leader, so that Prometheus can scrape metrics from all Alertmanager units. Also clarifies the terminology used when talking about netlocs, hostnames, etc.
  • Loading branch information
ca-scribner authored Jun 26, 2024
1 parent 9391fc3 commit 71c422f
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 42 deletions.
14 changes: 6 additions & 8 deletions src/alertmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def __init__(
charm,
*,
container_name: str,
peer_addresses: List[str],
peer_netlocs: List[str],
api_port: int,
ha_port: int,
web_external_url: str,
Expand All @@ -96,7 +96,7 @@ def __init__(
self._service_name = self._container_name = container_name
self._container = charm.unit.get_container(container_name)

self._peer_addresses = peer_addresses
self._peer_netlocs = peer_netlocs

self._api_port = api_port
self._ha_port = ha_port
Expand Down Expand Up @@ -169,17 +169,15 @@ def _alertmanager_layer(self) -> Layer:

def _command():
"""Returns full command line to start alertmanager."""
# cluster listen address - empty string disables HA mode
listen_address_arg = (
"" if len(self._peer_addresses) == 0 else f"0.0.0.0:{self._ha_port}"
)
# cluster listen netloc - empty string disables HA mode
listen_netloc_arg = "" if len(self._peer_netlocs) == 0 else f"0.0.0.0:{self._ha_port}"

# The chosen port in the cluster.listen-address flag is the port that needs to be
# specified in the cluster.peer flag of the other peers.
# Assuming all replicas use the same port.
# Sorting for repeatability in comparing between service layers.
peer_cmd_args = " ".join(
sorted([f"--cluster.peer={address}" for address in self._peer_addresses])
sorted([f"--cluster.peer={netloc}" for netloc in self._peer_netlocs])
)
web_config_arg = (
f"--web.config.file={self._web_config_path} " if self._is_tls_enabled() else ""
Expand All @@ -189,7 +187,7 @@ def _command():
f"--config.file={self._config_path} "
f"--storage.path={self._storage_path} "
f"--web.listen-address=:{self._api_port} "
f"--cluster.listen-address={listen_address_arg} "
f"--cluster.listen-address={listen_netloc_arg} "
f"--web.external-url={self._web_external_url} "
f"{web_config_arg}"
f"{peer_cmd_args}"
Expand Down
68 changes: 34 additions & 34 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,15 @@ def __init__(self, *args):
# Core lifecycle events
self.framework.observe(self.on.config_changed, self._on_config_changed)

peer_ha_netlocs = [
f"{hostname}:{self._ports.ha}"
for hostname in self._get_peer_hostnames(include_this_unit=False)
]

self.alertmanager_workload = WorkloadManager(
self,
container_name=self._container_name,
peer_addresses=self._get_peer_addresses(),
peer_netlocs=peer_ha_netlocs,
api_port=self.api_port,
ha_port=self._ports.ha,
web_external_url=self._internal_url,
Expand Down Expand Up @@ -240,16 +245,15 @@ def self_scraping_job(self):
# This assumption is necessary because the local CA signs CSRs with FQDN as the SAN DNS.
# If prometheus were to scrape an ingress URL instead, it would error out with:
# x509: cannot validate certificate.
metrics_endpoint = urlparse(self._internal_url.rstrip("/") + "/metrics")
metrics_path = metrics_endpoint.path
# Render a ':port' section only if it is explicit (e.g. 9093; without an explicit port, the
# port is deduced from the scheme).
port_str = (":" + str(metrics_endpoint.port)) if metrics_endpoint.port is not None else ""
target = f"{metrics_endpoint.hostname}{port_str}"
peer_api_netlocs = [
f"{hostname}:{self._ports.api}"
for hostname in self._get_peer_hostnames(include_this_unit=True)
]

config = {
"scheme": metrics_endpoint.scheme,
"metrics_path": metrics_path,
"static_configs": [{"targets": [target]}],
"scheme": self._scheme,
"metrics_path": "/metrics",
"static_configs": [{"targets": peer_api_netlocs}],
}

return [config]
Expand Down Expand Up @@ -526,29 +530,25 @@ def _update_ca_certs(self):
ca_cert_path.unlink(missing_ok=True)
subprocess.run(["update-ca-certificates", "--fresh"], check=True)

def _get_peer_addresses(self) -> List[str]:
"""Create a list of HA addresses of all peer units (all units excluding current).
def _get_peer_hostnames(self, include_this_unit=True) -> List[str]:
"""Returns a list of the hostnames of the peer units.
The returned addresses include the hostname, HA port number and path, but do not include
scheme (http).
An example of the return format is:
["alertmanager-1.alertmanager-endpoints.am.svc.cluster.local"]
"""
addresses = []
if include_this_unit:
addresses.append(self._internal_url)
if pr := self.peer_relation:
for unit in pr.units: # pr.units only holds peers (self.unit is not included)
if api_url := pr.data[unit].get("private_address"):
parsed = urlparse(api_url)
if not (parsed.scheme in ["http", "https"] and parsed.hostname):
# This shouldn't happen
logger.error(
"Invalid peer address in relation data: '%s'; skipping. "
"Address must include scheme (http or https) and hostname.",
api_url,
)
continue
# Drop scheme and replace API port with HA port
addresses.append(f"{parsed.hostname}:{self._ports.ha}{parsed.path}")

return addresses
if address := pr.data[unit].get("private_address"):
addresses.append(address)

# Save only the hostname part of the address
# Sort the hostnames in case their order is not guaranteed, to reduce unnecessary updates
hostnames = sorted([urlparse(address).hostname for address in addresses])

return hostnames

def _is_tls_ready(self) -> bool:
"""Returns True if the workload is ready to operate in TLS mode."""
Expand All @@ -561,18 +561,18 @@ def _is_tls_ready(self) -> bool:

@property
def _internal_url(self) -> str:
"""Return the fqdn dns-based in-cluster (private) address of the alertmanager api server.
If an external (public) url is set, add in its path.
"""
scheme = "https" if self._is_tls_ready() else "http"
return f"{scheme}://{socket.getfqdn()}:{self._ports.api}"
"""Return the fqdn dns-based in-cluster (private) address of the alertmanager api server."""
return f"{self._scheme}://{socket.getfqdn()}:{self._ports.api}"

@property
def _external_url(self) -> str:
"""Return the externally-reachable (public) address of the alertmanager api server."""
return self.ingress.url or self._internal_url

@property
def _scheme(self) -> str:
return "https" if self._is_tls_ready() else "http"

@property
def tracing_endpoint(self) -> Optional[str]:
"""Otlp http endpoint for charm instrumentation."""
Expand Down
81 changes: 81 additions & 0 deletions tests/unit/test_self_scrape_jobs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env python3
# Copyright 2021 Canonical Ltd.
# See LICENSE file for licensing details.
import unittest
from unittest.mock import PropertyMock, patch

from alertmanager import WorkloadManager
from charm import AlertmanagerCharm
from helpers import k8s_resource_multipatch
from ops.testing import Harness


class TestWithInitialHooks(unittest.TestCase):
    """Validate the self-scrape jobs the charm generates for prometheus."""

    container_name: str = "alertmanager"

    @patch("lightkube.core.client.GenericSyncClient")
    @patch.object(WorkloadManager, "check_config", lambda *a, **kw: ("ok", ""))
    @k8s_resource_multipatch
    @patch.object(WorkloadManager, "_alertmanager_version", property(lambda *_: "0.0.0"))
    def setUp(self, *unused):
        self.harness = Harness(AlertmanagerCharm)
        self.addCleanup(self.harness.cleanup)

        self.harness.set_leader(True)
        self.app_name = "am"
        # Add the peer relation ourselves, before begin_with_initial_hooks() runs;
        # otherwise the harness creates it implicitly and we never learn the rel_id.
        self.peer_rel_id = self.harness.add_relation("replicas", self.app_name)

        self.harness.begin_with_initial_hooks()

    @patch.object(AlertmanagerCharm, "_internal_url", new_callable=PropertyMock)
    @patch.object(AlertmanagerCharm, "_scheme", new_callable=PropertyMock)
    def test_self_scraping_job_with_no_peers(self, _mock_scheme, _mock_internal_url):
        scheme = "https"
        _mock_scheme.return_value = scheme
        target = f"test-internal.url:{self.harness.charm._ports.api}"
        _mock_internal_url.return_value = f"{scheme}://{target}"

        expected_jobs = [
            {
                "metrics_path": "/metrics",
                "scheme": scheme,
                "static_configs": [{"targets": [target]}],
            }
        ]

        self.assertEqual(self.harness.charm.self_scraping_job, expected_jobs)

    @patch.object(AlertmanagerCharm, "_internal_url", new_callable=PropertyMock)
    @patch.object(AlertmanagerCharm, "_scheme", new_callable=PropertyMock)
    def test_self_scraping_job_with_peers(self, _mock_scheme, _mock_internal_url):
        scheme = "https"
        _mock_scheme.return_value = scheme

        api_port = self.harness.charm._ports.api
        targets = [f"test-internal-{i}.url:{api_port}" for i in range(3)]
        _mock_internal_url.return_value = f"{scheme}://{targets[0]}"

        # Join the remaining units to the peer relation so they show up as targets.
        for i, target in enumerate(targets[1:], 1):
            unit_name = f"{self.app_name}/{i}"
            self.harness.add_relation_unit(self.peer_rel_id, unit_name)
            self.harness.update_relation_data(
                self.peer_rel_id, unit_name, {"private_address": f"{scheme}://{target}"}
            )

        expected_jobs = [
            {
                "metrics_path": "/metrics",
                "scheme": scheme,
                "static_configs": [{"targets": targets}],
            }
        ]

        self.assertEqual(expected_jobs, self.harness.charm.self_scraping_job)

0 comments on commit 71c422f

Please sign in to comment.