diff --git a/actions.yaml b/actions.yaml
index b3e3c24abb..4f112dda58 100644
--- a/actions.yaml
+++ b/actions.yaml
@@ -34,8 +34,13 @@ list-backups:
 pre-upgrade-check:
   description: Run necessary pre-upgrade checks and preparations before executing a charm refresh.
 promote-to-primary:
-  description: Promotes the cluster of choice to a primary cluster. Must be ran against the leader unit.
+  description: Promotes the cluster of choice to a primary cluster. Must be run against the leader unit when promoting a cluster
+    or against the unit to be promoted within the cluster.
   params:
+    scope:
+      type: string
+      default: cluster
+      description: Whether to promote a unit or a cluster. Must be set to either unit or cluster.
     force:
       type: boolean
       description: Force the promotion of a cluster when there is already a primary cluster.
diff --git a/src/charm.py b/src/charm.py
index fa21a1ec7b..de0e27a07c 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -112,7 +112,7 @@
     WORKLOAD_OS_GROUP,
     WORKLOAD_OS_USER,
 )
-from patroni import NotReadyError, Patroni, SwitchoverFailedError
+from patroni import NotReadyError, Patroni, SwitchoverFailedError, SwitchoverNotSyncError
 from relations.async_replication import (
     REPLICATION_CONSUMER_RELATION,
     REPLICATION_OFFER_RELATION,
@@ -211,6 +211,7 @@ def __init__(self, *args):
         self.framework.observe(self.on.stop, self._on_stop)
         self.framework.observe(self.on.get_password_action, self._on_get_password)
         self.framework.observe(self.on.set_password_action, self._on_set_password)
+        self.framework.observe(self.on.promote_to_primary_action, self._on_promote_to_primary)
         self.framework.observe(self.on.get_primary_action, self._on_get_primary)
         self.framework.observe(self.on.update_status, self._on_update_status)
         self._storage_path = self.meta.storages["pgdata"].location
@@ -1305,6 +1306,26 @@ def _on_set_password(self, event: ActionEvent) -> None:
 
         event.set_results({"password": password})
 
+    def _on_promote_to_primary(self, event: ActionEvent) -> None:
+        if event.params.get("scope") == "cluster":
+            return self.async_replication.promote_to_primary(event)
+        elif event.params.get("scope") == "unit":
+            return self.promote_primary_unit(event)
+        else:
+            event.fail("Scope should be either cluster or unit")
+
+    def promote_primary_unit(self, event: ActionEvent) -> None:
+        """Handles promote to primary for unit scope."""
+        if event.params.get("force"):
+            event.fail("Superfluous force flag with unit scope")
+        else:
+            try:
+                self._patroni.switchover(self.unit.name, wait=False)
+            except SwitchoverNotSyncError:
+                event.fail("Unit is not sync standby")
+            except SwitchoverFailedError:
+                event.fail("Switchover failed or timed out, check the logs for details")
+
     def _on_get_primary(self, event: ActionEvent) -> None:
         """Get primary instance."""
         try:
diff --git a/src/patroni.py b/src/patroni.py
index 342d97f4a6..ac9d64b407 100644
--- a/src/patroni.py
+++ b/src/patroni.py
@@ -53,6 +53,10 @@ class SwitchoverFailedError(Exception):
     """Raised when a switchover failed for some reason."""
 
 
+class SwitchoverNotSyncError(SwitchoverFailedError):
+    """Raised when a switchover failed because the node is not a sync standby."""
+
+
 class UpdateSyncNodeCountError(Exception):
     """Raised when updating synchronous_node_count failed for some reason."""
 
@@ -612,7 +616,7 @@ def restart_postgresql(self) -> None:
             timeout=PATRONI_TIMEOUT,
         )
 
-    def switchover(self, candidate: str | None = None) -> None:
+    def switchover(self, candidate: str | None = None, wait: bool = True) -> None:
         """Trigger a switchover."""
         # Try to trigger the switchover.
         if candidate is not None:
@@ -631,8 +635,18 @@ def switchover(self, candidate: str | None = None) -> None:
 
         # Check whether the switchover was unsuccessful.
         if r.status_code != 200:
+            if (
+                r.status_code == 412
+                and r.text == "candidate name does not match with sync_standby"
+            ):
+                logger.debug("Unit is not sync standby")
+                raise SwitchoverNotSyncError()
+            logger.warning(f"Switchover call failed with code {r.status_code} {r.text}")
             raise SwitchoverFailedError(f"received {r.status_code}")
 
+        if not wait:
+            return
+
         for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3), reraise=True):
             with attempt:
                 new_primary = self.get_primary()
diff --git a/src/relations/async_replication.py b/src/relations/async_replication.py
index 7b6ecf8bc0..1700de12b8 100644
--- a/src/relations/async_replication.py
+++ b/src/relations/async_replication.py
@@ -104,9 +104,6 @@ def __init__(self, charm):
         self.framework.observe(
             self.charm.on.create_replication_action, self._on_create_replication
         )
-        self.framework.observe(
-            self.charm.on.promote_to_primary_action, self._on_promote_to_primary
-        )
         self.framework.observe(self.charm.on.secret_changed, self._on_secret_changed)
 
@@ -575,7 +572,7 @@ def _on_create_replication(self, event: ActionEvent) -> None:
         # Set the status.
         self.charm.unit.status = MaintenanceStatus("Creating replication...")
 
-    def _on_promote_to_primary(self, event: ActionEvent) -> None:
+    def promote_to_primary(self, event: ActionEvent) -> None:
         """Promote this cluster to the primary cluster."""
         if (
             self.charm.app.status.message != READ_ONLY_MODE_BLOCKING_MESSAGE
diff --git a/tests/integration/ha_tests/test_async_replication.py b/tests/integration/ha_tests/test_async_replication.py
index 7facd6b61f..99e482409b 100644
--- a/tests/integration/ha_tests/test_async_replication.py
+++ b/tests/integration/ha_tests/test_async_replication.py
@@ -240,7 +240,7 @@ async def test_switchover(
     leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME, model=second_model)
     assert leader_unit is not None, "No leader unit found"
     logger.info("promoting the second cluster")
-    run_action = await leader_unit.run_action("promote-to-primary", **{"force": True})
+    run_action = await leader_unit.run_action("promote-to-primary", force=True, scope="cluster")
     await run_action.wait()
     assert (run_action.results.get("return-code", None) == 0) or (
         run_action.results.get("Code", None) == "0"
@@ -295,7 +295,7 @@ async def test_promote_standby(
     leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME)
     assert leader_unit is not None, "No leader unit found"
     logger.info("promoting the first cluster")
-    run_action = await leader_unit.run_action("promote-to-primary")
+    run_action = await leader_unit.run_action("promote-to-primary", scope="cluster")
     await run_action.wait()
     assert (run_action.results.get("return-code", None) == 0) or (
         run_action.results.get("Code", None) == "0"
diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py
index e60a53bbe8..9b8f444e51 100644
--- a/tests/integration/test_charm.py
+++ b/tests/integration/test_charm.py
@@ -13,6 +13,7 @@
 from pytest_operator.plugin import OpsTest
 from tenacity import Retrying, stop_after_delay, wait_fixed
 
+from .ha_tests.helpers import get_cluster_roles
 from .helpers import (
     CHARM_BASE,
     METADATA,
@@ -253,6 +254,21 @@ async def test_scale_down_and_up(ops_test: OpsTest):
     await scale_application(ops_test, APP_NAME, initial_scale)
 
 
+async def test_switchover_sync_standby(ops_test: OpsTest):
+    original_roles = await get_cluster_roles(
+        ops_test, ops_test.model.applications[APP_NAME].units[0].name
+    )
+    run_action = await ops_test.model.units[original_roles["sync_standbys"][0]].run_action(
+        "promote-to-primary", scope="unit"
+    )
+    await run_action.wait()
+    await ops_test.model.wait_for_idle(status="active", timeout=200)
+    new_roles = await get_cluster_roles(
+        ops_test, ops_test.model.applications[APP_NAME].units[0].name
+    )
+    assert new_roles["primaries"][0] == original_roles["sync_standbys"][0]
+
+
 async def test_persist_data_through_graceful_restart(ops_test: OpsTest):
     """Test data persists through a graceful restart."""
     primary = await get_primary(ops_test)
diff --git a/tests/unit/test_async_replication.py b/tests/unit/test_async_replication.py
index d56d71d98b..cc87a5817d 100644
--- a/tests/unit/test_async_replication.py
+++ b/tests/unit/test_async_replication.py
@@ -314,7 +314,7 @@ def test_promote_to_primary(harness, relation_name):
     )
     harness.update_relation_data(rel_id, "standby/0", {"unit-address": "10.2.2.10"})
 
-    harness.run_action("promote-to-primary")
+    harness.run_action("promote-to-primary", {"scope": "cluster"})
 
     assert (
         harness.get_relation_data(rel_id, harness.charm.app.name).get(
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
index 06aff155dc..6478a21bea 100644
--- a/tests/unit/test_charm.py
+++ b/tests/unit/test_charm.py
@@ -26,7 +26,7 @@
 
 from charm import EXTENSION_OBJECT_MESSAGE, PostgresqlOperatorCharm
 from constants import PEER, SECRET_INTERNAL_LABEL
-from patroni import NotReadyError
+from patroni import NotReadyError, SwitchoverFailedError, SwitchoverNotSyncError
 from tests.unit.helpers import _FakeApiError
 
 POSTGRESQL_CONTAINER = "postgresql"
@@ -1847,3 +1847,42 @@ def test_get_plugins(harness):
         "insert_username",
         "moddatetime",
     ]
+
+
+def test_on_promote_to_primary(harness):
+    with (
+        patch("charm.PostgreSQLAsyncReplication.promote_to_primary") as _promote_to_primary,
+        patch("charm.Patroni.switchover") as _switchover,
+    ):
+        event = Mock()
+        event.params = {"scope": "cluster"}
+
+        # Cluster
+        harness.charm._on_promote_to_primary(event)
+        _promote_to_primary.assert_called_once_with(event)
+
+        # Unit, no force, regular promotion
+        event.params = {"scope": "unit"}
+
+        harness.charm._on_promote_to_primary(event)
+
+        _switchover.assert_called_once_with("postgresql-k8s/0", wait=False)
+
+        # Unit, no force, switchover failed
+        event.params = {"scope": "unit"}
+        _switchover.side_effect = SwitchoverFailedError
+
+        harness.charm._on_promote_to_primary(event)
+
+        event.fail.assert_called_once_with(
+            "Switchover failed or timed out, check the logs for details"
+        )
+        event.fail.reset_mock()
+
+        # Unit, no force, not sync
+        event.params = {"scope": "unit"}
+        _switchover.side_effect = SwitchoverNotSyncError
+
+        harness.charm._on_promote_to_primary(event)
+
+        event.fail.assert_called_once_with("Unit is not sync standby")
diff --git a/tests/unit/test_patroni.py b/tests/unit/test_patroni.py
index 249db26c1f..63e59c109e 100644
--- a/tests/unit/test_patroni.py
+++ b/tests/unit/test_patroni.py
@@ -13,7 +13,7 @@
 
 from charm import PostgresqlOperatorCharm
 from constants import REWIND_USER
-from patroni import PATRONI_TIMEOUT, Patroni, SwitchoverFailedError
+from patroni import PATRONI_TIMEOUT, Patroni, SwitchoverFailedError, SwitchoverNotSyncError
 from tests.helpers import STORAGE_PATH
 
@@ -331,11 +331,9 @@ def test_switchover(harness, patroni):
         # Test failed switchovers.
         _post.reset_mock()
         _get_primary.side_effect = ["postgresql-k8s-0", "postgresql-k8s-1"]
-        try:
+        with pytest.raises(SwitchoverFailedError):
             patroni.switchover("postgresql-k8s/2")
             assert False
-        except SwitchoverFailedError:
-            pass
         _post.assert_called_once_with(
             "http://postgresql-k8s-0:8008/switchover",
             json={"leader": "postgresql-k8s-0", "candidate": "postgresql-k8s-2"},
@@ -347,11 +345,9 @@ def test_switchover(harness, patroni):
         _post.reset_mock()
         _get_primary.side_effect = ["postgresql-k8s-0", "postgresql-k8s-2"]
         response.status_code = 400
-        try:
+        with pytest.raises(SwitchoverFailedError):
             patroni.switchover("postgresql-k8s/2")
             assert False
-        except SwitchoverFailedError:
-            pass
         _post.assert_called_once_with(
             "http://postgresql-k8s-0:8008/switchover",
             json={"leader": "postgresql-k8s-0", "candidate": "postgresql-k8s-2"},
@@ -360,6 +356,14 @@ def test_switchover(harness, patroni):
             auth=patroni._patroni_auth,
             timeout=PATRONI_TIMEOUT,
         )
+        # Test candidate, not sync
+        response = _post.return_value
+        response.status_code = 412
+        response.text = "candidate name does not match with sync_standby"
+        with pytest.raises(SwitchoverNotSyncError):
+            patroni.switchover("candidate")
+            assert False
+
 
 def test_member_replication_lag(harness, patroni):
     with (
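
Reviewer note, not part of the patch: a minimal sketch of how the reworked promote-to-primary action is expected to be driven through python-libjuju, mirroring the integration tests in this diff. The application name "postgresql-k8s" and the unit indices are placeholders; the real leader and sync-standby units should be resolved first, as the tests do with get_leader_unit and get_cluster_roles.

# Hedged usage sketch, assuming python-libjuju and an already-deployed
# "postgresql-k8s" application; unit names below are placeholders.
import asyncio

from juju.model import Model


async def promote_examples() -> None:
    model = Model()
    await model.connect()  # connect to the current controller/model

    # Cluster scope: must target the leader unit of the (standby) cluster.
    leader = model.units["postgresql-k8s/0"]  # placeholder: resolve the real leader
    action = await leader.run_action("promote-to-primary", scope="cluster")
    await action.wait()

    # Unit scope: must target the sync standby that should become primary.
    sync_standby = model.units["postgresql-k8s/1"]  # placeholder: resolve the real sync standby
    action = await sync_standby.run_action("promote-to-primary", scope="unit")
    await action.wait()

    await model.disconnect()


if __name__ == "__main__":
    asyncio.run(promote_examples())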