Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion actions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,13 @@ list-backups:
pre-upgrade-check:
description: Run necessary pre-upgrade checks and preparations before executing a charm refresh.
promote-to-primary:
description: Promotes the cluster of choice to a primary cluster. Must be ran against the leader unit.
description: Promotes the cluster of choice to a primary cluster. Must be ran against the leader unit when promoting a cluster
or against the unit to be promoted within the cluster.
params:
scope:
type: string
default: cluster
description: Whether to promote a unit or a cluster. Must be set to either unit or cluster.
force:
type: boolean
description: Force the promotion of a cluster when there is already a primary cluster.
Expand Down
23 changes: 22 additions & 1 deletion src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@
WORKLOAD_OS_GROUP,
WORKLOAD_OS_USER,
)
from patroni import NotReadyError, Patroni, SwitchoverFailedError
from patroni import NotReadyError, Patroni, SwitchoverFailedError, SwitchoverNotSyncError
from relations.async_replication import (
REPLICATION_CONSUMER_RELATION,
REPLICATION_OFFER_RELATION,
Expand Down Expand Up @@ -211,6 +211,7 @@ def __init__(self, *args):
self.framework.observe(self.on.stop, self._on_stop)
self.framework.observe(self.on.get_password_action, self._on_get_password)
self.framework.observe(self.on.set_password_action, self._on_set_password)
self.framework.observe(self.on.promote_to_primary_action, self._on_promote_to_primary)
self.framework.observe(self.on.get_primary_action, self._on_get_primary)
self.framework.observe(self.on.update_status, self._on_update_status)
self._storage_path = self.meta.storages["pgdata"].location
Expand Down Expand Up @@ -1305,6 +1306,26 @@ def _on_set_password(self, event: ActionEvent) -> None:

event.set_results({"password": password})

def _on_promote_to_primary(self, event: ActionEvent) -> None:
if event.params.get("scope") == "cluster":
return self.async_replication.promote_to_primary(event)
elif event.params.get("scope") == "unit":
return self.promote_primary_unit(event)
else:
event.fail("Scope should be either cluster or unit")

def promote_primary_unit(self, event: ActionEvent) -> None:
"""Handles promote to primary for unit scope."""
if event.params.get("force"):
event.fail("Suprerfluous force flag with unit scope")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No RAFT, so no need to force.

else:
try:
self._patroni.switchover(self.unit.name, wait=False)
except SwitchoverNotSyncError:
event.fail("Unit is not sync standby")
except SwitchoverFailedError:
event.fail("Switchover failed or timed out, check the logs for details")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If wait=False, can this catched error be due to time out at all? (as implied by the error message)


def _on_get_primary(self, event: ActionEvent) -> None:
"""Get primary instance."""
try:
Expand Down
16 changes: 15 additions & 1 deletion src/patroni.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ class SwitchoverFailedError(Exception):
"""Raised when a switchover failed for some reason."""


class SwitchoverNotSyncError(SwitchoverFailedError):
"""Raised when a switchover failed because node is not sync."""


class UpdateSyncNodeCountError(Exception):
"""Raised when updating synchronous_node_count failed for some reason."""

Expand Down Expand Up @@ -612,7 +616,7 @@ def restart_postgresql(self) -> None:
timeout=PATRONI_TIMEOUT,
)

def switchover(self, candidate: str | None = None) -> None:
def switchover(self, candidate: str | None = None, wait: bool = True) -> None:
"""Trigger a switchover."""
# Try to trigger the switchover.
if candidate is not None:
Expand All @@ -631,8 +635,18 @@ def switchover(self, candidate: str | None = None) -> None:

# Check whether the switchover was unsuccessful.
if r.status_code != 200:
if (
r.status_code == 412
and r.text == "candidate name does not match with sync_standby"
):
logger.debug("Unit is not sync standby")
raise SwitchoverNotSyncError()
logger.warning(f"Switchover call failed with code {r.status_code} {r.text}")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Log Patroni's error code, the body should contain the reason.

raise SwitchoverFailedError(f"received {r.status_code}")

if not wait:
return
Comment on lines +647 to +648
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When running the action we shouldn't wait for the switchover itself to happen.


for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3), reraise=True):
with attempt:
new_primary = self.get_primary()
Expand Down
5 changes: 1 addition & 4 deletions src/relations/async_replication.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,6 @@ def __init__(self, charm):
self.framework.observe(
self.charm.on.create_replication_action, self._on_create_replication
)
self.framework.observe(
self.charm.on.promote_to_primary_action, self._on_promote_to_primary
)

self.framework.observe(self.charm.on.secret_changed, self._on_secret_changed)

Expand Down Expand Up @@ -575,7 +572,7 @@ def _on_create_replication(self, event: ActionEvent) -> None:
# Set the status.
self.charm.unit.status = MaintenanceStatus("Creating replication...")

def _on_promote_to_primary(self, event: ActionEvent) -> None:
def promote_to_primary(self, event: ActionEvent) -> None:
"""Promote this cluster to the primary cluster."""
if (
self.charm.app.status.message != READ_ONLY_MODE_BLOCKING_MESSAGE
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/ha_tests/test_async_replication.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ async def test_switchover(
leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME, model=second_model)
assert leader_unit is not None, "No leader unit found"
logger.info("promoting the second cluster")
run_action = await leader_unit.run_action("promote-to-primary", **{"force": True})
run_action = await leader_unit.run_action("promote-to-primary", force=True, scope="cluster")
await run_action.wait()
assert (run_action.results.get("return-code", None) == 0) or (
run_action.results.get("Code", None) == "0"
Expand Down Expand Up @@ -295,7 +295,7 @@ async def test_promote_standby(
leader_unit = await get_leader_unit(ops_test, DATABASE_APP_NAME)
assert leader_unit is not None, "No leader unit found"
logger.info("promoting the first cluster")
run_action = await leader_unit.run_action("promote-to-primary")
run_action = await leader_unit.run_action("promote-to-primary", scope="cluster")
await run_action.wait()
assert (run_action.results.get("return-code", None) == 0) or (
run_action.results.get("Code", None) == "0"
Expand Down
16 changes: 16 additions & 0 deletions tests/integration/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from pytest_operator.plugin import OpsTest
from tenacity import Retrying, stop_after_delay, wait_fixed

from .ha_tests.helpers import get_cluster_roles
from .helpers import (
CHARM_BASE,
METADATA,
Expand Down Expand Up @@ -253,6 +254,21 @@ async def test_scale_down_and_up(ops_test: OpsTest):
await scale_application(ops_test, APP_NAME, initial_scale)


async def test_switchover_sync_standby(ops_test: OpsTest):
original_roles = await get_cluster_roles(
ops_test, ops_test.model.applications[APP_NAME].units[0].name
)
run_action = await ops_test.model.units[original_roles["sync_standbys"][0]].run_action(
"promote-to-primary", scope="unit"
)
await run_action.wait()
await ops_test.model.wait_for_idle(status="active", timeout=200)
new_roles = await get_cluster_roles(
ops_test, ops_test.model.applications[APP_NAME].units[0].name
)
assert new_roles["primaries"][0] == original_roles["sync_standbys"][0]


async def test_persist_data_through_graceful_restart(ops_test: OpsTest):
"""Test data persists through a graceful restart."""
primary = await get_primary(ops_test)
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/test_async_replication.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ def test_promote_to_primary(harness, relation_name):
)
harness.update_relation_data(rel_id, "standby/0", {"unit-address": "10.2.2.10"})

harness.run_action("promote-to-primary")
harness.run_action("promote-to-primary", {"scope": "cluster"})

assert (
harness.get_relation_data(rel_id, harness.charm.app.name).get(
Expand Down
41 changes: 40 additions & 1 deletion tests/unit/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

from charm import EXTENSION_OBJECT_MESSAGE, PostgresqlOperatorCharm
from constants import PEER, SECRET_INTERNAL_LABEL
from patroni import NotReadyError
from patroni import NotReadyError, SwitchoverFailedError, SwitchoverNotSyncError
from tests.unit.helpers import _FakeApiError

POSTGRESQL_CONTAINER = "postgresql"
Expand Down Expand Up @@ -1847,3 +1847,42 @@ def test_get_plugins(harness):
"insert_username",
"moddatetime",
]


def test_on_promote_to_primary(harness):
with (
patch("charm.PostgreSQLAsyncReplication.promote_to_primary") as _promote_to_primary,
patch("charm.Patroni.switchover") as _switchover,
):
event = Mock()
event.params = {"scope": "cluster"}

# Cluster
harness.charm._on_promote_to_primary(event)
_promote_to_primary.assert_called_once_with(event)

# Unit, no force, regular promotion
event.params = {"scope": "unit"}

harness.charm._on_promote_to_primary(event)

_switchover.assert_called_once_with("postgresql-k8s/0", wait=False)

# Unit, no force, switchover failed
event.params = {"scope": "unit"}
_switchover.side_effect = SwitchoverFailedError

harness.charm._on_promote_to_primary(event)

event.fail.assert_called_once_with(
"Switchover failed or timed out, check the logs for details"
)
event.fail.reset_mock()

# Unit, no force, not sync
event.params = {"scope": "unit"}
_switchover.side_effect = SwitchoverNotSyncError

harness.charm._on_promote_to_primary(event)

event.fail.assert_called_once_with("Unit is not sync standby")
18 changes: 11 additions & 7 deletions tests/unit/test_patroni.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from charm import PostgresqlOperatorCharm
from constants import REWIND_USER
from patroni import PATRONI_TIMEOUT, Patroni, SwitchoverFailedError
from patroni import PATRONI_TIMEOUT, Patroni, SwitchoverFailedError, SwitchoverNotSyncError
from tests.helpers import STORAGE_PATH


Expand Down Expand Up @@ -331,11 +331,9 @@ def test_switchover(harness, patroni):
# Test failed switchovers.
_post.reset_mock()
_get_primary.side_effect = ["postgresql-k8s-0", "postgresql-k8s-1"]
try:
with pytest.raises(SwitchoverFailedError):
patroni.switchover("postgresql-k8s/2")
assert False
except SwitchoverFailedError:
pass
_post.assert_called_once_with(
"http://postgresql-k8s-0:8008/switchover",
json={"leader": "postgresql-k8s-0", "candidate": "postgresql-k8s-2"},
Expand All @@ -347,11 +345,9 @@ def test_switchover(harness, patroni):
_post.reset_mock()
_get_primary.side_effect = ["postgresql-k8s-0", "postgresql-k8s-2"]
response.status_code = 400
try:
with pytest.raises(SwitchoverFailedError):
patroni.switchover("postgresql-k8s/2")
assert False
except SwitchoverFailedError:
pass
_post.assert_called_once_with(
"http://postgresql-k8s-0:8008/switchover",
json={"leader": "postgresql-k8s-0", "candidate": "postgresql-k8s-2"},
Expand All @@ -360,6 +356,14 @@ def test_switchover(harness, patroni):
timeout=PATRONI_TIMEOUT,
)

# Test candidate, not sync
response = _post.return_value
response.status_code = 412
response.text = "candidate name does not match with sync_standby"
with pytest.raises(SwitchoverNotSyncError):
patroni.switchover("candidate")
assert False


def test_member_replication_lag(harness, patroni):
with (
Expand Down