diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml
index 5bbec160ed..4d2e5449b5 100644
--- a/.github/workflows/integration_test.yaml
+++ b/.github/workflows/integration_test.yaml
@@ -84,17 +84,8 @@ jobs:
     needs:
       - collect-integration-tests
    runs-on: ${{ matrix.job.runner }}
-    timeout-minutes: 226 # Sum of steps `timeout-minutes` + 5
+    timeout-minutes: 216 # Sum of steps `timeout-minutes` + 5
     steps:
-      - name: Free up disk space
-        timeout-minutes: 10
-        run: |
-          printf '\nDisk usage before cleanup\n'
-          df --human-readable
-          # Based on https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
-          rm -r /opt/hostedtoolcache/
-          printf '\nDisk usage after cleanup\n'
-          df --human-readable
       - name: Checkout
         timeout-minutes: 3
         uses: actions/checkout@v5
diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py
index 7b0a1a4646..18b3e1857c 100644
--- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py
+++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py
@@ -341,7 +341,7 @@ def _on_scrape_targets_changed(self, event):
 import yaml
 from cosl import JujuTopology
 from cosl.rules import AlertRules, generic_alert_groups
-from ops.charm import CharmBase, RelationJoinedEvent, RelationRole
+from ops.charm import CharmBase, RelationRole
 from ops.framework import (
     BoundEvent,
     EventBase,
@@ -350,7 +350,6 @@ def _on_scrape_targets_changed(self, event):
     ObjectEvents,
     StoredDict,
     StoredList,
-    StoredState,
 )
 from ops.model import Relation
 
@@ -362,7 +361,7 @@ def _on_scrape_targets_changed(self, event):
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 55
+LIBPATCH = 56
 
 # Version 0.0.53 needed for cosl.rules.generic_alert_groups
 PYDEPS = ["cosl>=0.0.53"]
@@ -797,43 +796,6 @@ def __init__(
         super().__init__(self.message)
 
 
-def _is_official_alert_rule_format(rules_dict: dict) -> bool:
-    """Are alert rules in the upstream format as supported by Prometheus.
-
-    Alert rules in dictionary format are in "official" form if they
-    contain a "groups" key, since this implies they contain a list of
-    alert rule groups.
-
-    Args:
-        rules_dict: a set of alert rules in Python dictionary format
-
-    Returns:
-        True if alert rules are in official Prometheus file format.
-    """
-    return "groups" in rules_dict
-
-
-def _is_single_alert_rule_format(rules_dict: dict) -> bool:
-    """Are alert rules in single rule format.
-
-    The Prometheus charm library supports reading of alert rules in a
-    custom format that consists of a single alert rule per file. This
-    does not conform to the official Prometheus alert rule file format
-    which requires that each alert rules file consists of a list of
-    alert rule groups and each group consists of a list of alert
-    rules.
-
-    Alert rules in dictionary form are considered to be in single rule
-    format if in the least it contains two keys corresponding to the
-    alert rule name and alert expression.
-
-    Returns:
-        True if alert rule is in single rule file format.
-    """
-    # one alert rule per file
-    return set(rules_dict) >= {"alert", "expr"}
-
-
 class TargetsChangedEvent(EventBase):
     """Event emitted when Prometheus scrape targets change."""
 
@@ -1271,15 +1233,6 @@ def _dedupe_job_names(jobs: List[dict]):
     return deduped_jobs
 
 
-def _dedupe_list(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    """Deduplicate items in the list via object identity."""
-    unique_items = []
-    for item in items:
-        if item not in unique_items:
-            unique_items.append(item)
-    return unique_items
-
-
 def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> str:
     """Resolve the provided path items against the directory of the main file.
 
@@ -1711,619 +1664,6 @@ def _update_relation_data(self, _):
             sort_keys=True,  # sort, to prevent unnecessary relation_changed events
         )
 
-
-class MetricsEndpointAggregator(Object):
-    """Aggregate metrics from multiple scrape targets.
-
-    `MetricsEndpointAggregator` collects scrape target information from one
-    or more related charms and forwards this to a `MetricsEndpointConsumer`
-    charm, which may be in a different Juju model. However, it is
-    essential that `MetricsEndpointAggregator` itself resides in the same
-    model as its scrape targets, as this is currently the only way to
-    ensure in Juju that the `MetricsEndpointAggregator` will be able to
-    determine the model name and uuid of the scrape targets.
-
-    `MetricsEndpointAggregator` should be used in place of
-    `MetricsEndpointProvider` in the following two use cases:
-
-    1. Integrating one or more scrape targets that do not support the
-    `prometheus_scrape` interface.
-
-    2. Integrating one or more scrape targets through cross model
-    relations. Although the [Scrape Config Operator](https://charmhub.io/cos-configuration-k8s)
-    may also be used for the purpose of supporting cross model
-    relations.
-
-    Using `MetricsEndpointAggregator` to build a Prometheus charm client
-    only requires instantiating it. Instantiating
-    `MetricsEndpointAggregator` is similar to `MetricsEndpointProvider` except
-    that it requires specifying the names of three relations: the
-    relation with scrape targets, the relation for alert rules, and
-    that with the Prometheus charms. For example
-
-    ```python
-    self._aggregator = MetricsEndpointAggregator(
-        self,
-        {
-            "prometheus": "monitoring",
-            "scrape_target": "prometheus-target",
-            "alert_rules": "prometheus-rules"
-        }
-    )
-    ```
-
-    `MetricsEndpointAggregator` assumes that each unit of a scrape target
-    sets in its unit-level relation data two entries with keys
-    "hostname" and "port". If it is required to integrate with charms
-    that do not honor these assumptions, it is always possible to
-    derive from `MetricsEndpointAggregator` overriding the `_get_targets()`
-    method, which is responsible for aggregating the unit name, host
-    address ("hostname") and port of the scrape target.
-
-    `MetricsEndpointAggregator` also assumes that each unit of a
-    scrape target sets in its unit-level relation data a key named
-    "groups". The value of this key is expected to be the string
-    representation of list of Prometheus Alert rules in YAML format.
-    An example of a single such alert rule is
-
-    ```yaml
-    - alert: HighRequestLatency
-      expr: job:request_latency_seconds:mean5m{job="myjob"} > 0.5
-      for: 10m
-      labels:
-        severity: page
-      annotations:
-        summary: High request latency
-    ```
-
-    Once again if it is required to integrate with charms that do not
-    honour these assumptions about alert rules then an object derived
-    from `MetricsEndpointAggregator` may be used by overriding the
-    `_get_alert_rules()` method.
-
-    `MetricsEndpointAggregator` ensures that Prometheus scrape job
-    specifications and alert rules are annotated with Juju topology
-    information, just like `MetricsEndpointProvider` and
-    `MetricsEndpointConsumer` do.
-
-    By default, `MetricsEndpointAggregator` ensures that Prometheus
-    "instance" labels refer to Juju topology. This ensures that
-    instance labels are stable over unit recreation. While it is not
-    advisable to change this option, if required it can be done by
-    setting the "relabel_instance" keyword argument to `False` when
-    constructing an aggregator object.
-    """
-
-    _stored = StoredState()
-
-    def __init__(
-        self,
-        charm,
-        relation_names: Optional[dict] = None,
-        relabel_instance=True,
-        resolve_addresses=False,
-        path_to_own_alert_rules: Optional[str] = None,
-        *,
-        forward_alert_rules: bool = True,
-    ):
-        """Construct a `MetricsEndpointAggregator`.
-
-        Args:
-            charm: a `CharmBase` object that manages this
-                `MetricsEndpointAggregator` object. Typically, this is
-                `self` in the instantiating class.
-            relation_names: a dictionary with three keys. The value
-                of the "scrape_target" and "alert_rules" keys are
-                the relation names over which scrape job and alert rule
-                information is gathered by this `MetricsEndpointAggregator`.
-                And the value of the "prometheus" key is the name of
-                the relation with a `MetricsEndpointConsumer` such as
-                the Prometheus charm.
-            relabel_instance: A boolean flag indicating if Prometheus
-                scrape job "instance" labels must refer to Juju Topology.
-            resolve_addresses: A boolean flag indiccating if the aggregator
-                should attempt to perform DNS lookups of targets and append
-                a `dns_name` label
-            path_to_own_alert_rules: Optionally supply a path for alert rule files
-            forward_alert_rules: a boolean flag to toggle forwarding of charmed alert rules
-        """
-        self._charm = charm
-
-        relation_names = relation_names or {}
-
-        self._prometheus_relation = relation_names.get(
-            "prometheus", "downstream-prometheus-scrape"
-        )
-        self._target_relation = relation_names.get("scrape_target", "prometheus-target")
-        self._alert_rules_relation = relation_names.get("alert_rules", "prometheus-rules")
-
-        super().__init__(charm, self._prometheus_relation)
-        self.topology = JujuTopology.from_charm(charm)
-
-        self._stored.set_default(jobs=[], alert_rules=[])
-
-        self._relabel_instance = relabel_instance
-        self._resolve_addresses = resolve_addresses
-
-        self._forward_alert_rules = forward_alert_rules
-
-        # manage Prometheus charm relation events
-        prometheus_events = self._charm.on[self._prometheus_relation]
-        self.framework.observe(prometheus_events.relation_joined, self._set_prometheus_data)
-
-        self.path_to_own_alert_rules = path_to_own_alert_rules
-
-        # manage list of Prometheus scrape jobs from related scrape targets
-        target_events = self._charm.on[self._target_relation]
-        self.framework.observe(target_events.relation_changed, self._on_prometheus_targets_changed)
-        self.framework.observe(
-            target_events.relation_departed, self._on_prometheus_targets_departed
-        )
-
-        # manage alert rules for Prometheus from related scrape targets
-        alert_rule_events = self._charm.on[self._alert_rules_relation]
-        self.framework.observe(alert_rule_events.relation_changed, self._on_alert_rules_changed)
-        self.framework.observe(alert_rule_events.relation_departed, self._on_alert_rules_departed)
-
-    def _set_prometheus_data(self, event: Optional[RelationJoinedEvent] = None):
-        """Ensure every new Prometheus instances is updated.
-
-        Any time a new Prometheus unit joins the relation with
-        `MetricsEndpointAggregator`, that Prometheus unit is provided
-        with the complete set of existing scrape jobs and alert rules.
-        """
-        if not self._charm.unit.is_leader():
-            return
-
-        # Gather the scrape jobs
-        jobs = [] + _type_convert_stored(
-            self._stored.jobs  # pyright: ignore
-        )  # list of scrape jobs, one per relation
-        for relation in self.model.relations[self._target_relation]:
-            targets = self._get_targets(relation)
-            if targets and relation.app:
-                jobs.append(self._static_scrape_job(targets, relation.app.name))
-
-        # Gather the alert rules
-        groups = [] + _type_convert_stored(
-            self._stored.alert_rules  # pyright: ignore
-        )  # list of alert rule groups
-        for relation in self.model.relations[self._alert_rules_relation]:
-            unit_rules = self._get_alert_rules(relation)
-            if unit_rules and relation.app:
-                appname = relation.app.name
-                rules = self._label_alert_rules(unit_rules, appname)
-                group = {"name": self.group_name(appname), "rules": rules}
-                groups.append(group)
-        alert_rules = AlertRules(query_type="promql", topology=self.topology)
-        # Add alert rules from file
-        if self.path_to_own_alert_rules:
-            alert_rules.add_path(self.path_to_own_alert_rules, recursive=True)
-        # Add generic alert rules
-        alert_rules.add(
-            copy.deepcopy(generic_alert_groups.application_rules),
-            group_name_prefix=self.topology.identifier,
-        )
-        groups.extend(alert_rules.as_dict()["groups"])
-
-        groups = _dedupe_list(groups)
-        jobs = _dedupe_list(jobs)
-
-        # Set scrape jobs and alert rules in relation data
-        relations = [event.relation] if event else self.model.relations[self._prometheus_relation]
-        for rel in relations:
-            rel.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs)  # type: ignore
-            rel.data[self._charm.app]["alert_rules"] = json.dumps(  # type: ignore
-                {"groups": groups if self._forward_alert_rules else []}
-            )
-
-    def _on_prometheus_targets_changed(self, event):
-        """Update scrape jobs in response to scrape target changes.
-
-        When there is any change in relation data with any scrape
-        target, the Prometheus scrape job, for that specific target is
-        updated.
-        """
-        targets = self._get_targets(event.relation)
-        if not targets:
-            return
-
-        # new scrape job for the relation that has changed
-        self.set_target_job_data(targets, event.relation.app.name)
-
-    def set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None:
-        """Update scrape jobs in response to scrape target changes.
-
-        When there is any change in relation data with any scrape
-        target, the Prometheus scrape job, for that specific target is
-        updated. Additionally, if this method is called manually, do the
-        same.
-
-        Args:
-            targets: a `dict` containing target information
-            app_name: a `str` identifying the application
-            kwargs: a `dict` of the extra arguments passed to the function
-        """
-        if not self._charm.unit.is_leader():
-            return
-
-        # new scrape job for the relation that has changed
-        updated_job = self._static_scrape_job(targets, app_name, **kwargs)
-
-        for relation in self.model.relations[self._prometheus_relation]:
-            jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]"))
-            # list of scrape jobs that have not changed
-            jobs = [job for job in jobs if updated_job["job_name"] != job["job_name"]]
-            jobs.append(updated_job)
-            relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs)
-
-        if not _type_convert_stored(self._stored.jobs) == jobs:  # pyright: ignore
-            self._stored.jobs = jobs
-
-    def _on_prometheus_targets_departed(self, event):
-        """Remove scrape jobs when a target departs.
-
-        Any time a scrape target departs, any Prometheus scrape job
-        associated with that specific scrape target is removed.
-        """
-        job_name = self._job_name(event.relation.app.name)
-        unit_name = event.unit.name
-        self.remove_prometheus_jobs(job_name, unit_name)
-
-    def remove_prometheus_jobs(self, job_name: str, unit_name: Optional[str] = ""):
-        """Given a job name and unit name, remove scrape jobs associated.
-
-        The `unit_name` parameter is used for automatic, relation data bag-based
-        generation, where the unit name in labels can be used to ensure that jobs with
-        similar names (which are generated via the app name when scanning relation data
-        bags) are not accidentally removed, as their unit name labels will differ.
-        For NRPE, the job name is calculated from an ID sent via the NRPE relation, and is
-        sufficient to uniquely identify the target.
-        """
-        if not self._charm.unit.is_leader():
-            return
-
-        for relation in self.model.relations[self._prometheus_relation]:
-            jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]"))
-            if not jobs:
-                continue
-
-            changed_job = [j for j in jobs if j.get("job_name") == job_name]
-            if not changed_job:
-                continue
-            changed_job = changed_job[0]
-
-            # list of scrape jobs that have not changed
-            jobs = [job for job in jobs if job.get("job_name") != job_name]
-
-            # list of scrape jobs for units of the same application that still exist
-            configs_kept = [
-                config
-                for config in changed_job["static_configs"]  # type: ignore
-                if config.get("labels", {}).get("juju_unit") != unit_name
-            ]
-
-            if configs_kept:
-                changed_job["static_configs"] = configs_kept  # type: ignore
-                jobs.append(changed_job)
-
-            relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs)
-
-        if not _type_convert_stored(self._stored.jobs) == jobs:  # pyright: ignore
-            self._stored.jobs = jobs
-
-    def _job_name(self, appname) -> str:
-        """Construct a scrape job name.
-
-        Each relation has its own unique scrape job name. All units in
-        the relation are scraped as part of the same scrape job.
-
-        Args:
-            appname: string name of a related application.
-
-        Returns:
-            a string Prometheus scrape job name for the application.
-        """
-        return "juju_{}_{}_{}_prometheus_scrape".format(
-            self.model.name, self.model.uuid[:7], appname
-        )
-
-    def _get_targets(self, relation) -> dict:
-        """Fetch scrape targets for a relation.
-
-        Scrape target information is returned for each unit in the
-        relation. This information contains the unit name, network
-        hostname (or address) for that unit, and port on which a
-        metrics endpoint is exposed in that unit.
-
-        Args:
-            relation: an `ops.model.Relation` object for which scrape
-                targets are required.
-
-        Returns:
-            a dictionary whose keys are names of the units in the
-            relation. There values associated with each key is itself
-            a dictionary of the form
-            ```
-            {"hostname": hostname, "port": port}
-            ```
-        """
-        targets = {}
-        for unit in relation.units:
-            if not (unit_databag := relation.data.get(unit)):
-                continue
-            if not (hostname := unit_databag.get("hostname")):
-                continue
-            port = unit_databag.get("port", 80)
-            targets.update({unit.name: {"hostname": hostname, "port": port}})
-
-        return targets
-
-    def _static_scrape_job(self, targets, application_name, **kwargs) -> dict:
-        """Construct a static scrape job for an application.
-
-        Args:
-            targets: a dictionary providing hostname and port for all
-                scrape target. The keys of this dictionary are unit
-                names. Values corresponding to these keys are
-                themselves a dictionary with keys "hostname" and
-                "port".
-            application_name: a string name of the application for
-                which this static scrape job is being constructed.
-            kwargs: a `dict` of the extra arguments passed to the function
-
-        Returns:
-            A dictionary corresponding to a Prometheus static scrape
-            job configuration for one application. The returned
-            dictionary may be transformed into YAML and appended to
-            the list of any existing list of Prometheus static configs.
-        """
-        juju_model = self.model.name
-        juju_model_uuid = self.model.uuid
-
-        job = {
-            "job_name": self._job_name(application_name),
-            "static_configs": [
-                {
-                    "targets": ["{}:{}".format(target["hostname"], target["port"])],
-                    "labels": {
-                        "juju_model": juju_model,
-                        "juju_model_uuid": juju_model_uuid,
-                        "juju_application": application_name,
-                        "juju_unit": unit_name,
-                        "host": target["hostname"],
-                        # Expanding this will merge the dicts and replace the
-                        # topology labels if any were present/found
-                        **self._static_config_extra_labels(target),
-                    },
-                }
-                for unit_name, target in targets.items()
-            ],
-            "relabel_configs": self._relabel_configs + kwargs.get("relabel_configs", []),
-        }
-        job.update(kwargs.get("updates", {}))
-
-        return job
-
-    def _static_config_extra_labels(self, target: Dict[str, str]) -> Dict[str, str]:
-        """Build a list of extra static config parameters, if specified."""
-        extra_info = {}
-
-        if self._resolve_addresses:
-            try:
-                dns_name = socket.gethostbyaddr(target["hostname"])[0]
-            except OSError:
-                logger.debug("Could not perform DNS lookup for %s", target["hostname"])
-                dns_name = target["hostname"]
-            extra_info["dns_name"] = dns_name
-
-        return extra_info
-
-    @property
-    def _relabel_configs(self) -> list:
-        """Create Juju topology relabeling configuration.
-
-        Using Juju topology for instance labels ensures that these
-        labels are stable across unit recreation.
-
-        Returns:
-            a list of Prometheus relabeling configurations. Each item in
-            this list is one relabel configuration.
-        """
-        return (
-            [
-                {
-                    "source_labels": [
-                        "juju_model",
-                        "juju_model_uuid",
-                        "juju_application",
-                        "juju_unit",
-                    ],
-                    "separator": "_",
-                    "target_label": "instance",
-                    "regex": "(.*)",
-                }
-            ]
-            if self._relabel_instance
-            else []
-        )
-
-    def _on_alert_rules_changed(self, event):
-        """Update alert rules in response to scrape target changes.
-
-        When there is any change in alert rule relation data for any
-        scrape target, the list of alert rules for that specific
-        target is updated.
-        """
-        unit_rules = self._get_alert_rules(event.relation)
-        if not unit_rules:
-            return
-
-        app_name = event.relation.app.name
-        self.set_alert_rule_data(app_name, unit_rules)
-
-    def set_alert_rule_data(self, name: str, unit_rules: dict, label_rules: bool = True) -> None:
-        """Consolidate incoming alert rules (from stored-state or event) with those from relation data.
-
-        The unit rules should be a dict, which have additional Juju topology labels added. For
-        rules generated by the NRPE exporter, they are pre-labeled so lookups can be performed.
-        The unit rules are combined with the alert rules from relation data before being written
-        back to relation data and stored-state.
-        """
-        if not self._charm.unit.is_leader():
-            return
-
-        if label_rules:
-            rules = self._label_alert_rules(unit_rules, name)
-        else:
-            rules = [unit_rules]
-        updated_group = {"name": self.group_name(name), "rules": rules}
-
-        for relation in self.model.relations[self._prometheus_relation]:
-            alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}"))
-            groups = alert_rules.get("groups", [])
-            # list of alert rule groups that have not changed
-            for group in groups:
-                if group["name"] == updated_group["name"]:
-                    group["rules"] = [r for r in group["rules"] if r not in updated_group["rules"]]
-                    group["rules"].extend(updated_group["rules"])
-
-            if updated_group["name"] not in [g["name"] for g in groups]:
-                groups.append(updated_group)
-
-            groups = _dedupe_list(groups)
-
-            relation.data[self._charm.app]["alert_rules"] = json.dumps(
-                {"groups": groups if self._forward_alert_rules else []}
-            )
-
-        if not _type_convert_stored(self._stored.alert_rules) == groups:  # pyright: ignore
-            self._stored.alert_rules = groups
-
-    def _on_alert_rules_departed(self, event):
-        """Remove alert rules for departed targets.
-
-        Any time a scrape target departs any alert rules associated
-        with that specific scrape target is removed.
-        """
-        group_name = self.group_name(event.relation.app.name)
-        unit_name = event.unit.name
-        self.remove_alert_rules(group_name, unit_name)
-
-    def remove_alert_rules(self, group_name: str, unit_name: str) -> None:
-        """Remove an alert rule group from relation data."""
-        if not self._charm.unit.is_leader():
-            return
-
-        for relation in self.model.relations[self._prometheus_relation]:
-            alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}"))
-            if not alert_rules:
-                continue
-
-            groups = alert_rules.get("groups", [])
-            if not groups:
-                continue
-
-            changed_group = [group for group in groups if group["name"] == group_name]
-            if not changed_group:
-                continue
-            changed_group = changed_group[0]
-
-            # list of alert rule groups that have not changed
-            groups = [group for group in groups if group["name"] != group_name]
-
-            # list of alert rules not associated with departing unit
-            rules_kept = [
-                rule
-                for rule in changed_group.get("rules")  # type: ignore
-                if rule.get("labels").get("juju_unit") != unit_name
-            ]
-
-            if rules_kept:
-                changed_group["rules"] = rules_kept  # type: ignore
-                groups.append(changed_group)
-
-            groups = _dedupe_list(groups)
-
-            relation.data[self._charm.app]["alert_rules"] = json.dumps(
-                {"groups": groups if self._forward_alert_rules else []}
-            )
-
-        if not _type_convert_stored(self._stored.alert_rules) == groups:  # pyright: ignore
-            self._stored.alert_rules = groups
-
-    def _get_alert_rules(self, relation) -> dict:
-        """Fetch alert rules for a relation.
-
-        Each unit of the related scrape target may have its own
-        associated alert rules. Alert rules for all units are returned
-        indexed by unit name.
-
-        Args:
-            relation: an `ops.model.Relation` object for which alert
-                rules are required.
-
-        Returns:
-            a dictionary whose keys are names of the units in the
-            relation. There values associated with each key is a list
-            of alert rules. Each rule is in dictionary format. The
-            structure "rule dictionary" corresponds to single
-            Prometheus alert rule.
-        """
-        rules = {}
-        for unit in relation.units:
-            if not (unit_databag := relation.data.get(unit)):
-                continue
-            if not (unit_rules := yaml.safe_load(unit_databag.get("groups", ""))):
-                continue
-
-            rules.update({unit.name: unit_rules})
-
-        return rules
-
-    def group_name(self, unit_name: str) -> str:
-        """Construct name for an alert rule group.
-
-        Each unit in a relation may define its own alert rules. All
-        rules, for all units in a relation are grouped together and
-        given a single alert rule group name.
-
-        Args:
-            unit_name: string name of a related application.
-
-        Returns:
-            a string Prometheus alert rules group name for the unit.
-        """
-        unit_name = re.sub(r"/", "_", unit_name)
-        return "juju_{}_{}_{}_alert_rules".format(self.model.name, self.model.uuid[:7], unit_name)
-
-    def _label_alert_rules(self, unit_rules, app_name: str) -> list:
-        """Apply juju topology labels to alert rules.
-
-        Args:
-            unit_rules: a list of alert rules, where each rule is in
-                dictionary format.
-            app_name: a string name of the application to which the
-                alert rules belong.
-
-        Returns:
-            a list of alert rules with Juju topology labels.
-        """
-        labeled_rules = []
-        for unit_name, rules in unit_rules.items():
-            for rule in rules:
-                # the new JujuTopology removed this, so build it up by hand
-                matchers = {
-                    "juju_{}".format(k): v
-                    for k, v in JujuTopology(self.model.name, self.model.uuid, app_name, unit_name)
-                    .as_dict(excluded_keys=["charm_name"])
-                    .items()
-                }
-                rule["labels"].update(matchers.items())
-                labeled_rules.append(rule)
-
-        return labeled_rules
-
-
 class CosTool:
     """Uses cos-tool to inject label matchers into alert rule expressions and validate rules."""
 
diff --git a/spread.yaml b/spread.yaml
index d80899e77b..be564d141d 100644
--- a/spread.yaml
+++ b/spread.yaml
@@ -82,6 +82,9 @@ backends:
         sudo passwd -d runner
 
         ADDRESS localhost
+
+        sudo mkdir -p /var/snap/microk8s/common/default-storage
+        sudo mount --bind /mnt /var/snap/microk8s/common/default-storage
     # HACK: spread does not pass environment variables set on runner
     # Manually pass specific environment variables
     environment:
diff --git a/src/charm.py b/src/charm.py
index fcff76da90..75c376381b 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -651,23 +651,6 @@ def _on_peer_relation_changed(self, event: HookEvent) -> None:  # noqa: C901
             event.defer()
             return
 
-        # Restart the workload if it's stuck on the starting state after a timeline divergence
-        # due to a backup that was restored.
-        if (
-            not self.is_primary
-            and not self.is_standby_leader
-            and (
-                self._patroni.member_replication_lag == "unknown"
-                or int(self._patroni.member_replication_lag) > 1000
-            )
-        ):
-            logger.warning("Workload failure detected. Reinitialising unit.")
-            self.unit.status = MaintenanceStatus("reinitialising replica")
-            self._patroni.reinitialize_postgresql()
-            logger.debug("Deferring on_peer_relation_changed: reinitialising replica")
-            event.defer()
-            return
-
         try:
             self.postgresql_client_relation.update_read_only_endpoint()
         except ModelError as e:
@@ -1619,9 +1602,6 @@ def _on_update_status(self, _) -> None:
         ) and not self._was_restore_successful(container, services[0]):
             return
 
-        if self._handle_processes_failures():
-            return
-
         # Update the sync-standby endpoint in the async replication data.
         self.async_replication.update_async_replication_data()
 
@@ -1698,51 +1678,6 @@ def _was_restore_successful(self, container: Container, service: ServiceInfo) ->
 
         return True
 
-    def _handle_processes_failures(self) -> bool:
-        """Handle Patroni and PostgreSQL OS processes failures.
-
-        Returns:
-            a bool indicating whether the charm performed any action.
-        """
-        container = self.unit.get_container("postgresql")
-
-        # Restart the Patroni process if it was killed (in that case, the PostgreSQL
-        # process is still running). This is needed until
-        # https://github.com/canonical/pebble/issues/149 is resolved.
-        if not self._patroni.member_started and self._patroni.is_database_running:
-            try:
-                container.restart(self.postgresql_service)
-                logger.info("restarted Patroni because it was not running")
-            except ChangeError:
-                logger.error("failed to restart Patroni after checking that it was not running")
-                return False
-            return True
-
-        try:
-            is_primary = self.is_primary
-            is_standby_leader = self.is_standby_leader
-        except RetryError:
-            return False
-
-        if (
-            not is_primary
-            and not is_standby_leader
-            and self._patroni.member_started
-            and not self._patroni.member_streaming
-        ):
-            try:
-                logger.warning("Degraded member detected: reinitialising unit")
-                self.unit.status = MaintenanceStatus("reinitialising replica")
-                self._patroni.reinitialize_postgresql()
-            except RetryError:
-                logger.error(
-                    "failed to reinitialise replica after checking that it was not streaming from primary"
-                )
-                return False
-            return True
-
-        return False
-
     @property
     def _patroni(self):
         """Returns an instance of the Patroni object."""
diff --git a/src/patroni.py b/src/patroni.py
index 0fafb35d0e..c8cd0c7569 100644
--- a/src/patroni.py
+++ b/src/patroni.py
@@ -452,19 +452,6 @@ def member_streaming(self) -> bool:
 
         return r.json().get("replication_state") == "streaming"
 
-    @property
-    def is_database_running(self) -> bool:
-        """Returns whether the PostgreSQL database process is running (and isn't frozen)."""
-        container = self._charm.unit.get_container("postgresql")
-        output = container.exec(["ps", "aux"]).wait_output()
-        postgresql_processes = [
-            process
-            for process in output[0].split("/n")
-            if "/usr/lib/postgresql/14/bin/postgres" in process
-        ]
-        # Check whether the PostgreSQL process has a state equal to T (frozen).
-        return any(process for process in postgresql_processes if process.split()[7] != "T")
-
     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
     def bulk_update_parameters_controller_by_patroni(self, parameters: dict[str, Any]) -> None:
         """Update the value of a parameter controller by Patroni.
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
index cc04ff19ce..74bd635ac4 100644
--- a/tests/unit/test_charm.py
+++ b/tests/unit/test_charm.py
@@ -3,7 +3,6 @@
 import itertools
 import json
 import logging
-from datetime import datetime
 from unittest import TestCase
 from unittest.mock import MagicMock, Mock, PropertyMock, call, patch, sentinel
 
@@ -18,7 +17,7 @@
     RelationDataTypeError,
     WaitingStatus,
 )
-from ops.pebble import Change, ChangeError, ChangeID, ServiceStatus
+from ops.pebble import ChangeError, ServiceStatus
 from ops.testing import Harness
 from requests import ConnectionError as RequestsConnectionError
 from tenacity import RetryError, wait_fixed
@@ -377,13 +376,10 @@ def test_fail_to_get_primary(harness):
 def test_on_update_status(harness):
     with (
         patch("charm.logger") as _logger,
-        patch(
-            "charm.PostgresqlOperatorCharm._handle_processes_failures"
-        ) as _handle_processes_failures,
         patch(
             "charm.PostgresqlOperatorCharm.enable_disable_extensions"
         ) as _enable_disable_extensions,
-        patch("charm.Patroni.member_started") as _member_started,
+        patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started,
         patch("charm.Patroni.get_primary") as _get_primary,
         patch("ops.model.Container.pebble") as _pebble,
         patch("ops.model.Container.restart") as _restart,
@@ -422,24 +418,18 @@
         _enable_disable_extensions.assert_called_once()
         _pebble.get_services.assert_called_once()
 
-        # Test when a failure need to be handled.
-        _pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)]
-        _handle_processes_failures.return_value = True
-        harness.charm.on.update_status.emit()
-        _get_primary.assert_not_called()
-
         # Test when a failure need to be handled.
         _pebble.get_services.return_value = [MagicMock(current=ServiceStatus.INACTIVE)]
         harness.charm.on.update_status.emit()
-        _get_primary.assert_not_called()
         _restart.assert_called_once_with("postgresql")
         _restart.reset_mock()
+        _get_primary.assert_called_once_with(unit_name_pattern=True)
+        _get_primary.reset_mock()
 
         # Test restart failed
         _pebble.get_services.return_value = [MagicMock(current=ServiceStatus.INACTIVE)]
         _restart.side_effect = ChangeError(err=None, change=None)
         harness.charm.on.update_status.emit()
-        _get_primary.assert_not_called()
         _restart.assert_called_once_with("postgresql")
         _logger.exception.assert_called_once_with("Failed to restart patroni")
         _restart.reset_mock()
@@ -448,13 +438,11 @@
 
         # Check primary message not being set (current unit is not the primary).
         _pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)]
-        _handle_processes_failures.return_value = False
         _get_primary.side_effect = [
             "postgresql-k8s/1",
             harness.charm.unit.name,
         ]
         harness.charm.on.update_status.emit()
-        _get_primary.assert_called_once()
         assert harness.model.unit.status != ActiveStatus("Primary")
 
         # Test again and check primary message being set (current unit is the primary).
@@ -476,9 +464,6 @@ def test_on_update_status_no_connection(harness):
 
 def test_on_update_status_with_error_on_get_primary(harness):
     with (
-        patch(
-            "charm.PostgresqlOperatorCharm._handle_processes_failures", return_value=False
-        ) as _handle_processes_failures,
         patch("charm.Patroni.member_started") as _member_started,
         patch("charm.Patroni.get_primary") as _get_primary,
         patch("ops.model.Container.pebble") as _pebble,
@@ -648,9 +633,6 @@ def test_on_pgdata_storage_detaching(harness):
 def test_on_update_status_after_restore_operation(harness):
     with (
         patch("charm.PostgresqlOperatorCharm._set_active_status") as _set_active_status,
-        patch(
-            "charm.PostgresqlOperatorCharm._handle_processes_failures"
-        ) as _handle_processes_failures,
         patch("charm.PostgreSQLBackups.can_use_s3_repository") as _can_use_s3_repository,
         patch(
             "single_kernel_postgresql.utils.postgresql.PostgreSQL.get_current_timeline"
@@ -677,7 +659,6 @@
         harness.set_can_connect(POSTGRESQL_CONTAINER, True)
         harness.charm.on.update_status.emit()
         _update_config.assert_not_called()
-        _handle_processes_failures.assert_not_called()
         _set_active_status.assert_not_called()
         assert isinstance(harness.charm.unit.status, BlockedStatus)
 
@@ -687,7 +668,6 @@
         _member_started.return_value = False
         harness.charm.on.update_status.emit()
         _update_config.assert_not_called()
-        _handle_processes_failures.assert_not_called()
         _set_active_status.assert_not_called()
         tc.assertIsInstance(harness.charm.unit.status, ActiveStatus)
 
@@ -700,10 +680,8 @@
         # Test when the restore operation finished successfully.
         _member_started.return_value = True
         _can_use_s3_repository.return_value = (True, None)
-        _handle_processes_failures.return_value = False
         harness.charm.on.update_status.emit()
         _update_config.assert_called_once()
-        _handle_processes_failures.assert_called_once()
         _set_active_status.assert_called_once()
         assert isinstance(harness.charm.unit.status, ActiveStatus)
 
@@ -1323,99 +1301,6 @@ def test_on_peer_relation_changed(harness):
         _set_active_status.assert_not_called()
 
 
-def test_handle_processes_failures(harness):
-    with (
-        patch("charm.Patroni.reinitialize_postgresql") as _reinitialize_postgresql,
-        patch("charm.Patroni.member_streaming", new_callable=PropertyMock) as _member_streaming,
-        patch(
-            "charm.PostgresqlOperatorCharm.is_standby_leader", new_callable=PropertyMock
-        ) as _is_standby_leader,
-        patch(
-            "charm.PostgresqlOperatorCharm.is_primary", new_callable=PropertyMock
-        ) as _is_primary,
-        patch(
-            "charm.Patroni.is_database_running", new_callable=PropertyMock
-        ) as _is_database_running,
-        patch("charm.Patroni.member_started", new_callable=PropertyMock) as _member_started,
-        patch("ops.model.Container.restart") as _restart,
-    ):
-        # Test when there are no processes failures to handle.
-        harness.set_can_connect(POSTGRESQL_CONTAINER, True)
-        for values in itertools.product(
-            [True, False], [True, False], [True, False], [True, False], [True, False]
-        ):
-            # Skip conditions that lead to handling a process failure.
-            if (not values[0] and values[2]) or (not values[3] and values[1] and not values[4]):
-                continue
-
-            _member_started.side_effect = [values[0], values[1]]
-            _is_database_running.return_value = values[2]
-            _is_primary.return_value = values[3]
-            _member_streaming.return_value = values[4]
-            assert not harness.charm._handle_processes_failures()
-            _restart.assert_not_called()
-            _reinitialize_postgresql.assert_not_called()
-
-        # Test when the Patroni process is not running.
-        _is_database_running.return_value = True
-        for values in itertools.product(
-            [
-                None,
-                ChangeError(
-                    err="fake error",
-                    change=Change(
-                        ChangeID("1"),
-                        "fake kind",
-                        "fake summary",
-                        "fake status",
-                        [],
-                        True,
-                        "fake error",
-                        datetime.now(),
-                        datetime.now(),
-                    ),
-                ),
-            ],
-            [True, False],
-            [True, False],
-            [True, False],
-        ):
-            _restart.reset_mock()
-            _restart.side_effect = values[0]
-            _is_primary.return_value = values[1]
-            _member_started.side_effect = [False, values[2]]
-            _member_streaming.return_value = values[3]
-            harness.charm.unit.status = ActiveStatus()
-            result = harness.charm._handle_processes_failures()
-            assert result == (values[0] is None)
-            assert isinstance(harness.charm.unit.status, ActiveStatus)
-            _restart.assert_called_once_with("postgresql")
-            _reinitialize_postgresql.assert_not_called()
-
-        # Test when the unit is a replica and it's not streaming from primary.
-        _restart.reset_mock()
-        _is_primary.return_value = False
-        _is_standby_leader.return_value = False
-        _member_streaming.return_value = False
-        for values in itertools.product(
-            [None, RetryError(last_attempt=1)], [True, False], [True, False]
-        ):
-            # Skip the condition that lead to handling other process failure.
-            if not values[1] and values[2]:
-                continue
-
-            _reinitialize_postgresql.reset_mock()
-            _reinitialize_postgresql.side_effect = values[0]
-            _member_started.side_effect = [values[1], True]
-            _is_database_running.return_value = values[2]
-            harness.charm.unit.status = ActiveStatus()
-            result = harness.charm._handle_processes_failures()
-            assert result == (values[0] is None)
-            assert isinstance(harness.charm.unit.status, MaintenanceStatus)
-            _restart.assert_not_called()
-            _reinitialize_postgresql.assert_called_once()
-
-
 def test_push_ca_file_into_workload(harness):
     with (
         patch("charm.PostgresqlOperatorCharm.update_config") as _update_config,