Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 2 additions & 52 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -715,23 +715,6 @@ def _on_peer_relation_changed(self, event: HookEvent):
event.defer()
return

# Restart the workload if it's stuck on the starting state after a timeline divergence
# due to a backup that was restored.
if (
not self.is_primary
and not self.is_standby_leader
and (
self._patroni.member_replication_lag == "unknown"
or int(self._patroni.member_replication_lag) > 1000
)
):
logger.warning("Degraded member detected: reinitialising unit")
self.unit.status = MaintenanceStatus("reinitialising replica")
self._patroni.reinitialize_postgresql()
logger.debug("Deferring on_peer_relation_changed: reinitialising replica")
event.defer()
return

self._start_stop_pgbackrest_service(event)

# This is intended to be executed only when the leader is reinitializing the S3 connection due to a leader change.
Expand Down Expand Up @@ -1635,7 +1618,8 @@ def _on_update_status(self, _) -> None:
if self.primary_endpoint:
self._update_relation_endpoints()

if self._handle_workload_failures():
if not self._patroni.member_started and self._patroni.is_member_isolated:
self._patroni.restart_patroni()
return

# Update the sync-standby endpoint in the async replication data.
Expand Down Expand Up @@ -1745,40 +1729,6 @@ def _handle_processes_failures(self) -> bool:

return False

def _handle_workload_failures(self) -> bool:
    """Detect and react to a failed workload (Patroni or PostgreSQL).

    Two recovery actions are attempted, in this order:

    1. Reinitialise the replica when the unit has no raft keys, is neither
       the primary nor the standby leader, its member has not started, the
       ``postgresql_restarted`` flag is present in this unit's peer data and
       the reported replication lag is ``"unknown"``.
    2. Restart Patroni when the member has not started and is isolated
       from the cluster.

    Returns:
        True when one of the recovery actions was triggered; False when
        nothing was done, including when the unit role could not be
        determined because a ``RetryError`` was raised.
    """
    # The role lookups may exhaust their retries; without a known role
    # no recovery decision is taken.
    try:
        primary = self.is_primary
        standby_leader = self.is_standby_leader
    except RetryError:
        return False

    # Workload stuck on the starting state after a restart: every check is
    # evaluated lazily, left to right, so later lookups only run when the
    # earlier ones did not already rule the case out.
    stuck_after_restart = (
        not (
            self.has_raft_keys()
            or primary
            or standby_leader
            or self._patroni.member_started
        )
        and "postgresql_restarted" in self._peers.data[self.unit]
        and self._patroni.member_replication_lag == "unknown"
    )
    if stuck_after_restart:
        logger.warning("Workload failure detected. Reinitialising unit.")
        self.unit.status = MaintenanceStatus("reinitialising replica")
        self._patroni.reinitialize_postgresql()
        return True

    # Nothing to do when the member is running or is still part of the cluster.
    if self._patroni.member_started or not self._patroni.is_member_isolated:
        return False

    # The member is isolated from the cluster (stuck with the
    # "awaiting for member to start" message): restart the service.
    self._patroni.restart_patroni()
    return True

def _set_primary_status_message(self) -> None:
"""Display 'Primary' in the unit status message if the current unit is the primary."""
try:
Expand Down
25 changes: 1 addition & 24 deletions tests/unit/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -907,7 +907,6 @@ def test_on_update_status(harness):
) as _set_primary_status_message,
patch("charm.Patroni.restart_patroni") as _restart_patroni,
patch("charm.Patroni.is_member_isolated") as _is_member_isolated,
patch("charm.Patroni.reinitialize_postgresql") as _reinitialize_postgresql,
patch(
"charm.Patroni.member_replication_lag", new_callable=PropertyMock
) as _member_replication_lag,
Expand Down Expand Up @@ -979,24 +978,9 @@ def test_on_update_status(harness):
harness.charm.on.update_status.emit()
_set_primary_status_message.assert_called_once()

# Test the reinitialisation of the replica when its lag is unknown
# after a restart.
# Test call to restart when the member is isolated from the cluster.
_set_primary_status_message.reset_mock()
_is_primary.return_value = False
_is_standby_leader.return_value = False
_member_started.return_value = False
_is_member_isolated.return_value = False
_member_replication_lag.return_value = "unknown"
with harness.hooks_disabled():
harness.update_relation_data(
rel_id, harness.charm.unit.name, {"postgresql_restarted": "True"}
)
harness.charm.on.update_status.emit()
_reinitialize_postgresql.assert_called_once()
_restart_patroni.assert_not_called()
_set_primary_status_message.assert_not_called()

# Test call to restart when the member is isolated from the cluster.
_is_member_isolated.return_value = True
with harness.hooks_disabled():
harness.update_relation_data(
Expand All @@ -1013,9 +997,6 @@ def test_on_update_status_after_restore_operation(harness):
patch(
"charm.PostgresqlOperatorCharm._set_primary_status_message"
) as _set_primary_status_message,
patch(
"charm.PostgresqlOperatorCharm._handle_workload_failures"
) as _handle_workload_failures,
patch(
"charm.PostgresqlOperatorCharm._update_relation_endpoints"
) as _update_relation_endpoints,
Expand Down Expand Up @@ -1052,7 +1033,6 @@ def test_on_update_status_after_restore_operation(harness):
_handle_processes_failures.assert_not_called()
_oversee_users.assert_not_called()
_update_relation_endpoints.assert_not_called()
_handle_workload_failures.assert_not_called()
_set_primary_status_message.assert_not_called()
assert isinstance(harness.charm.unit.status, BlockedStatus)

Expand All @@ -1065,7 +1045,6 @@ def test_on_update_status_after_restore_operation(harness):
_handle_processes_failures.assert_not_called()
_oversee_users.assert_not_called()
_update_relation_endpoints.assert_not_called()
_handle_workload_failures.assert_not_called()
_set_primary_status_message.assert_not_called()
assert isinstance(harness.charm.unit.status, ActiveStatus)

Expand All @@ -1079,13 +1058,11 @@ def test_on_update_status_after_restore_operation(harness):
_member_started.return_value = True
_can_use_s3_repository.return_value = (True, None)
_handle_processes_failures.return_value = False
_handle_workload_failures.return_value = False
harness.charm.on.update_status.emit()
_update_config.assert_called_once()
_handle_processes_failures.assert_called_once()
_oversee_users.assert_called_once()
_update_relation_endpoints.assert_called_once()
_handle_workload_failures.assert_called_once()
_set_primary_status_message.assert_called_once()
assert isinstance(harness.charm.unit.status, ActiveStatus)

Expand Down