From 512ee0b58ce211793445c33c1920ca20fb1a69e9 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 19 Sep 2025 15:32:58 +0300 Subject: [PATCH] Remove reinits --- src/charm.py | 54 ++-------------------------------------- tests/unit/test_charm.py | 25 +------------------ 2 files changed, 3 insertions(+), 76 deletions(-) diff --git a/src/charm.py b/src/charm.py index e979ea81d7..3b2bb9bb2f 100755 --- a/src/charm.py +++ b/src/charm.py @@ -715,23 +715,6 @@ def _on_peer_relation_changed(self, event: HookEvent): event.defer() return - # Restart the workload if it's stuck on the starting state after a timeline divergence - # due to a backup that was restored. - if ( - not self.is_primary - and not self.is_standby_leader - and ( - self._patroni.member_replication_lag == "unknown" - or int(self._patroni.member_replication_lag) > 1000 - ) - ): - logger.warning("Degraded member detected: reinitialising unit") - self.unit.status = MaintenanceStatus("reinitialising replica") - self._patroni.reinitialize_postgresql() - logger.debug("Deferring on_peer_relation_changed: reinitialising replica") - event.defer() - return - self._start_stop_pgbackrest_service(event) # This is intended to be executed only when leader is reinitializing S3 connection due to the leader change. @@ -1635,7 +1618,8 @@ def _on_update_status(self, _) -> None: if self.primary_endpoint: self._update_relation_endpoints() - if self._handle_workload_failures(): + if not self._patroni.member_started and self._patroni.is_member_isolated: + self._patroni.restart_patroni() return # Update the sync-standby endpoint in the async replication data. @@ -1745,40 +1729,6 @@ def _handle_processes_failures(self) -> bool: return False - def _handle_workload_failures(self) -> bool: - """Handle workload (Patroni or PostgreSQL) failures. - - Returns: - a bool indicating whether the charm performed any action. - """ - # Restart the workload if it's stuck on the starting state after a restart. - try: - is_primary = self.is_primary - is_standby_leader = self.is_standby_leader - except RetryError: - return False - - if ( - not self.has_raft_keys() - and not is_primary - and not is_standby_leader - and not self._patroni.member_started - and "postgresql_restarted" in self._peers.data[self.unit] - and self._patroni.member_replication_lag == "unknown" - ): - logger.warning("Workload failure detected. Reinitialising unit.") - self.unit.status = MaintenanceStatus("reinitialising replica") - self._patroni.reinitialize_postgresql() - return True - - # Restart the service if the current cluster member is isolated from the cluster - # (stuck with the "awaiting for member to start" message). - if not self._patroni.member_started and self._patroni.is_member_isolated: - self._patroni.restart_patroni() - return True - - return False - def _set_primary_status_message(self) -> None: """Display 'Primary' in the unit status message if the current unit is the primary.""" try: diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 9c8b9efc4d..db5a67d1b8 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -907,7 +907,6 @@ def test_on_update_status(harness): ) as _set_primary_status_message, patch("charm.Patroni.restart_patroni") as _restart_patroni, patch("charm.Patroni.is_member_isolated") as _is_member_isolated, - patch("charm.Patroni.reinitialize_postgresql") as _reinitialize_postgresql, patch( "charm.Patroni.member_replication_lag", new_callable=PropertyMock ) as _member_replication_lag, @@ -979,24 +978,9 @@ def test_on_update_status(harness): harness.charm.on.update_status.emit() _set_primary_status_message.assert_called_once() - # Test the reinitialisation of the replica when its lag is unknown - # after a restart. + # Test call to restart when the member is isolated from the cluster. _set_primary_status_message.reset_mock() - _is_primary.return_value = False - _is_standby_leader.return_value = False _member_started.return_value = False - _is_member_isolated.return_value = False - _member_replication_lag.return_value = "unknown" - with harness.hooks_disabled(): - harness.update_relation_data( - rel_id, harness.charm.unit.name, {"postgresql_restarted": "True"} - ) - harness.charm.on.update_status.emit() - _reinitialize_postgresql.assert_called_once() - _restart_patroni.assert_not_called() - _set_primary_status_message.assert_not_called() - - # Test call to restart when the member is isolated from the cluster. _is_member_isolated.return_value = True with harness.hooks_disabled(): harness.update_relation_data( @@ -1013,9 +997,6 @@ def test_on_update_status_after_restore_operation(harness): patch( "charm.PostgresqlOperatorCharm._set_primary_status_message" ) as _set_primary_status_message, - patch( - "charm.PostgresqlOperatorCharm._handle_workload_failures" - ) as _handle_workload_failures, patch( "charm.PostgresqlOperatorCharm._update_relation_endpoints" ) as _update_relation_endpoints, @@ -1052,7 +1033,6 @@ def test_on_update_status_after_restore_operation(harness): _handle_processes_failures.assert_not_called() _oversee_users.assert_not_called() _update_relation_endpoints.assert_not_called() - _handle_workload_failures.assert_not_called() _set_primary_status_message.assert_not_called() assert isinstance(harness.charm.unit.status, BlockedStatus) @@ -1065,7 +1045,6 @@ def test_on_update_status_after_restore_operation(harness): _handle_processes_failures.assert_not_called() _oversee_users.assert_not_called() _update_relation_endpoints.assert_not_called() - _handle_workload_failures.assert_not_called() _set_primary_status_message.assert_not_called() assert isinstance(harness.charm.unit.status, ActiveStatus) @@ -1079,13 +1058,11 @@ def test_on_update_status_after_restore_operation(harness): _member_started.return_value = True _can_use_s3_repository.return_value = (True, None) _handle_processes_failures.return_value = False - _handle_workload_failures.return_value = False harness.charm.on.update_status.emit() _update_config.assert_called_once() _handle_processes_failures.assert_called_once() _oversee_users.assert_called_once() _update_relation_endpoints.assert_called_once() - _handle_workload_failures.assert_called_once() _set_primary_status_message.assert_called_once() assert isinstance(harness.charm.unit.status, ActiveStatus)