Skip to content

Commit

Permalink
Add absent_over_time for command_status (#117)
Browse files Browse the repository at this point in the history
* Add absent_over_time to expr
* Add manual tests
* fetch-lib

---------

Co-authored-by: Luca Bello <36242061+lucabello@users.noreply.github.com>
  • Loading branch information
sed-i and lucabello authored Feb 6, 2024
1 parent e4d7244 commit 7fce85d
Show file tree
Hide file tree
Showing 7 changed files with 121 additions and 29 deletions.
22 changes: 13 additions & 9 deletions lib/charms/nrpe_exporter/v0/nrpe_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 4
LIBPATCH = 5


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -453,21 +453,25 @@ def _generate_data(self, relation) -> Tuple[list, list]:

def _generate_alert(self, relation, cmd, id, unit) -> dict:
"""Generate an on-the-fly Alert rule."""
pattern = r"^(.*?)[-_](\d+)$"
if match := re.match(pattern, id.replace("_", "-")):
app_name, unit_num = match.groups()
else:
raise ValueError(f"Invalid unit identifier '{id}': expected a string like 'unit-0'")

unit_label = f"{app_name}/{unit_num}"
return {
"alert": "{}NrpeAlert".format("".join([x.title() for x in cmd.split("_")])),
# Average over 5 minutes considering a 60-second scrape interval
"expr": "avg_over_time(command_status{{juju_unit='{}',command='{}'}}[15m]) > 1".format(
re.sub(r"^(.*?)[-_](\d+)$", r"\1/\2", id.replace("_", "-")), cmd
)
+ " or (absent_over_time(up{{juju_unit='{}'}}[10m]) == 1)".format(
re.sub(r"^(.*?)[-_](\d+)$", r"\1/\2", id.replace("_", "-")),
),
"expr": f"avg_over_time(command_status{{juju_unit='{unit_label}',command='{cmd}'}}[15m]) > 1"
+ f" or (absent_over_time(command_status{{juju_unit='{unit_label}',command='{cmd}'}}[10m]) == 1)"
+ f" or (absent_over_time(up{{juju_unit='{unit_label}'}}[10m]) == 1)",
"for": "0m",
"labels": {
"severity": "{{ if eq $value 0.0 -}} info {{- else if eq $value 1.0 -}} warning {{- else if eq $value 2.0 -}} critical {{- else if eq $value 3.0 -}} error {{- end }}",
"juju_model": self.model.name,
"juju_application": re.sub(r"^(.*?)[-_]\d+$", r"\1", id.replace("_", "-")),
"juju_unit": re.sub(r"^(.*?)[-_](\d+)$", r"\1/\2", id.replace("_", "-")),
"juju_application": app_name,
"juju_unit": unit_label,
"nrpe_application": relation.app.name,
"nrpe_unit": unit.name,
},
Expand Down
16 changes: 8 additions & 8 deletions lib/charms/operator_libs_linux/v0/apt.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@
import subprocess
from collections.abc import Mapping
from enum import Enum
from subprocess import PIPE, CalledProcessError, check_call, check_output
from subprocess import PIPE, CalledProcessError, check_output
from typing import Iterable, List, Optional, Tuple, Union
from urllib.parse import urlparse

Expand All @@ -122,7 +122,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 11
LIBPATCH = 13


VALID_SOURCE_TYPES = ("deb", "deb-src")
Expand Down Expand Up @@ -250,10 +250,10 @@ def _apt(
try:
env = os.environ.copy()
env["DEBIAN_FRONTEND"] = "noninteractive"
check_call(_cmd, env=env, stderr=PIPE, stdout=PIPE)
subprocess.run(_cmd, capture_output=True, check=True, text=True, env=env)
except CalledProcessError as e:
raise PackageError(
"Could not {} package(s) [{}]: {}".format(command, [*package_names], e.output)
"Could not {} package(s) [{}]: {}".format(command, [*package_names], e.stderr)
) from None

def _add(self) -> None:
Expand Down Expand Up @@ -476,7 +476,7 @@ def from_apt_cache(
)
except CalledProcessError as e:
raise PackageError(
"Could not list packages in apt-cache: {}".format(e.output)
"Could not list packages in apt-cache: {}".format(e.stderr)
) from None

pkg_groups = output.strip().split("\n\n")
Expand Down Expand Up @@ -748,7 +748,7 @@ def add_package(

packages = {"success": [], "retry": [], "failed": []}

package_names = [package_names] if type(package_names) is str else package_names
package_names = [package_names] if isinstance(package_names, str) else package_names
if not package_names:
raise TypeError("Expected at least one package name to add, received zero!")

Expand Down Expand Up @@ -818,7 +818,7 @@ def remove_package(
"""
packages = []

package_names = [package_names] if type(package_names) is str else package_names
package_names = [package_names] if isinstance(package_names, str) else package_names
if not package_names:
raise TypeError("Expected at least one package name to add, received zero!")

Expand All @@ -837,7 +837,7 @@ def remove_package(

def update() -> None:
"""Update the apt cache via `apt-get update`."""
check_call(["apt-get", "update"], stderr=PIPE, stdout=PIPE)
subprocess.run(["apt-get", "update"], capture_output=True, check=True)


def import_key(key: str) -> str:
Expand Down
28 changes: 17 additions & 11 deletions lib/charms/prometheus_k8s/v0/prometheus_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ def _on_scrape_targets_changed(self, event):

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 42
LIBPATCH = 44

PYDEPS = ["cosl"]

Expand All @@ -386,6 +386,7 @@ def _on_scrape_targets_changed(self, event):
"basic_auth",
"tls_config",
"authorization",
"params",
}
DEFAULT_JOB = {
"metrics_path": "/metrics",
Expand Down Expand Up @@ -764,7 +765,7 @@ def _validate_relation_by_interface_and_direction(
actual_relation_interface = relation.interface_name
if actual_relation_interface != expected_relation_interface:
raise RelationInterfaceMismatchError(
relation_name, expected_relation_interface, actual_relation_interface
relation_name, expected_relation_interface, actual_relation_interface or "None"
)

if expected_relation_role == RelationRole.provides:
Expand Down Expand Up @@ -857,7 +858,7 @@ class MonitoringEvents(ObjectEvents):
class MetricsEndpointConsumer(Object):
"""A Prometheus based Monitoring service."""

on = MonitoringEvents()
on = MonitoringEvents() # pyright: ignore

def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME):
"""A Prometheus based Monitoring service.
Expand Down Expand Up @@ -1014,7 +1015,6 @@ def alerts(self) -> dict:
try:
scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"])
identifier = JujuTopology.from_dict(scrape_metadata).identifier
alerts[identifier] = self._tool.apply_label_matchers(alert_rules) # type: ignore

except KeyError as e:
logger.debug(
Expand All @@ -1029,6 +1029,10 @@ def alerts(self) -> dict:
)
continue

# We need to append the relation info to the identifier. This is to allow for cases where there are two
# relations which eventually scrape the same application. Issue #551.
identifier = f"{identifier}_{relation.name}_{relation.id}"

alerts[identifier] = alert_rules

_, errmsg = self._tool.validate_alert_rules(alert_rules)
Expand Down Expand Up @@ -1294,7 +1298,7 @@ def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> st
class MetricsEndpointProvider(Object):
"""A metrics endpoint for Prometheus."""

on = MetricsEndpointProviderEvents()
on = MetricsEndpointProviderEvents() # pyright: ignore

def __init__(
self,
Expand Down Expand Up @@ -1836,14 +1840,16 @@ def _set_prometheus_data(self, event):
return

jobs = [] + _type_convert_stored(
self._stored.jobs
self._stored.jobs # pyright: ignore
) # list of scrape jobs, one per relation
for relation in self.model.relations[self._target_relation]:
targets = self._get_targets(relation)
if targets and relation.app:
jobs.append(self._static_scrape_job(targets, relation.app.name))

groups = [] + _type_convert_stored(self._stored.alert_rules) # list of alert rule groups
groups = [] + _type_convert_stored(
self._stored.alert_rules # pyright: ignore
) # list of alert rule groups
for relation in self.model.relations[self._alert_rules_relation]:
unit_rules = self._get_alert_rules(relation)
if unit_rules and relation.app:
Expand Down Expand Up @@ -1895,7 +1901,7 @@ def set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None:
jobs.append(updated_job)
relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs)

if not _type_convert_stored(self._stored.jobs) == jobs:
if not _type_convert_stored(self._stored.jobs) == jobs: # pyright: ignore
self._stored.jobs = jobs

def _on_prometheus_targets_departed(self, event):
Expand Down Expand Up @@ -1947,7 +1953,7 @@ def remove_prometheus_jobs(self, job_name: str, unit_name: Optional[str] = ""):

relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs)

if not _type_convert_stored(self._stored.jobs) == jobs:
if not _type_convert_stored(self._stored.jobs) == jobs: # pyright: ignore
self._stored.jobs = jobs

def _job_name(self, appname) -> str:
Expand Down Expand Up @@ -2126,7 +2132,7 @@ def set_alert_rule_data(self, name: str, unit_rules: dict, label_rules: bool = T
groups.append(updated_group)
relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups})

if not _type_convert_stored(self._stored.alert_rules) == groups:
if not _type_convert_stored(self._stored.alert_rules) == groups: # pyright: ignore
self._stored.alert_rules = groups

def _on_alert_rules_departed(self, event):
Expand Down Expand Up @@ -2176,7 +2182,7 @@ def remove_alert_rules(self, group_name: str, unit_name: str) -> None:
json.dumps({"groups": groups}) if groups else "{}"
)

if not _type_convert_stored(self._stored.alert_rules) == groups:
if not _type_convert_stored(self._stored.alert_rules) == groups: # pyright: ignore
self._stored.alert_rules = groups

def _get_alert_rules(self, relation) -> dict:
Expand Down
29 changes: 29 additions & 0 deletions tests/manual/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
## Test NRPE
### Deploy
First deploy the [`nrpe-k8s-bundle`](nrpe-k8s-bundle.yaml) in a k8s model, and then deploy the
[`nrpe-lxd-bundle`](nrpe-lxd-bundle.yaml) in a lxd model.

```mermaid
graph LR
subgraph lxd
ubuntu --- nrpe --- cos-proxy
end
subgraph k8s
prometheus
end
cos-proxy ---prometheus
```

### Verify
- Make sure rule files are available in prometheus:
- relation data: `juju show-unit prom/0`
- on disk: `juju ssh --container prometheus prom/0 ls /etc/prometheus/rules`
- via http api: `curl x.x.x.x:9090/api/v1/rules | jq`
- Stop the nrpe-exporter service, `juju ssh cp/0 sudo systemctl stop nrpe-exporter`, and make sure:
- All nrpe related targets are down, `curl x.x.x.x:9090/api/v1/targets | jq`
- 10 minutes after stopping, alerts are firing:
`curl x.x.x.x:9090/api/v1/rules | jq '.data.groups | .[].rules | .[] | select(.state == "firing")'`
- Start the nrpe exporter, `juju ssh cp/0 sudo systemctl start nrpe-exporter`, and make sure
the alerts are not firing anymore (need to wait for the scrape interval, 1m, to elapse).
19 changes: 19 additions & 0 deletions tests/manual/nrpe-k8s-bundle.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
bundle: kubernetes
applications:
prom:
charm: prometheus-k8s
channel: edge
revision: 170
scale: 1
trust: true

--- # overlay.yaml
applications:
prom:
offers:
prom:
endpoints:
- metrics-endpoint
- receive-remote-write
acl:
admin: admin
33 changes: 33 additions & 0 deletions tests/manual/nrpe-lxd-bundle.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
series: jammy

saas:
prom:
url: k8s:admin/cos.prom

applications:
cp:
# charm path is relative to the bundle file
charm: ../../cos-proxy_ubuntu-20.04-amd64_ubuntu-22.04-amd64.charm
series: focal
num_units: 1
nrpe:
charm: nrpe
channel: edge
ub:
charm: ubuntu
channel: edge
num_units: 1

relations:

# NRPE forwards names of checks (e.g. "check_conntrack") over the monitors relation
- - nrpe:monitors
- cp:monitors

# NRPE is a subordinate charm, so we use "ubuntu" as a stand-in principal
- - ub:juju-info
- nrpe:general-info

# cos-proxy generates alert rules on-the-fly from the checks it got from the "monitors" relation.
- - cp:downstream-prometheus-scrape
- prom:metrics-endpoint
3 changes: 2 additions & 1 deletion tests/scenario/test_alerts.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ def test_relation(ctx, n_remote_units):
"monitors",
remote_app_name="remote",
remote_units_data={
i: {"monitors": yaml.safe_dump(monitors_raw)} for i in range(n_remote_units)
i: {"monitors": yaml.safe_dump(monitors_raw), "target-id": "juju-ubuntu-0"}
for i in range(n_remote_units)
},
)
state_in = State(leader=True, relations=[monitors], networks=[Network.default("monitors")])
Expand Down

0 comments on commit 7fce85d

Please sign in to comment.