
Commit f9c4aa4

[DPE-7726] Use Patroni API for is_restart_pending() (instead of SQL select from pg_settings) (#1049)
* DPE-7726 Use Patroni API for is_restart_pending()

  The previous is_restart_pending() waited a long time due to Patroni's loop_wait default value (10 seconds), which tells how long Patroni waits before re-reading its configuration file to reload it. Instead of checking PostgreSQL's pending_restart in pg_settings, check the Patroni API's pending_restart=True flag.

* DPE-7726 Avoid pending_restart=True flag flickering

  The current Patroni 3.2.2 has weird, flickering behaviour: it temporarily flags pending_restart=True on many REST API changes. The flag is gone within a second, but that is long enough to be caught by the charm. Sleeping a bit is a necessary evil until the Patroni 3.3.0 upgrade. The previous code slept for 15 seconds waiting for pg_settings to update.

  Unnecessary restarts could also be triggered by a mismatch between the Patroni config file and in-memory changes coming from the REST API, e.g. the slots were undefined in the YAML file but set as an empty JSON {} => None. The default template is updated to match the default API PATCHes and avoid restarts.

* DPE-7726 Fix topology observer Primary status removal

  On a topology observer event, the primary unit used to lose its Primary label.

* DPE-7726 Add Patroni API logging

  Also: use a common logger everywhere, add several useful log messages (e.g. DB connection), remove the no-longer-necessary debug message 'Init class PostgreSQL', align the Patroni API request style everywhere, and add the Patroni API call duration to debug logs.

* DPE-7726 Avoid unnecessary Patroni reloads

  The list of IPs was randomly sorted, causing unnecessary Patroni configuration re-generation followed by a Patroni restart/reload.

* DPE-7726 Remove unnecessary property app_units() and scoped_peer_data()

  Housekeeping cleanup.

* DPE-7726 Stop deferring for non-joined peers on on_start/on_config_changed

  Those defers are necessary to support scale-up/scale-down during a refresh, but they significantly slow down the PostgreSQL 16 bootstrap (and other routine maintenance tasks such as re-scaling and full node reboot/recovery). They are muted for now, with a proper documentation record forbidding rescaling during a refresh, until we minimise the number of defers in PG16. A warning is logged for us to recall this promise.

* DPE-7726 Start the observer on non-primary Patroni start to speed up re-join

  The current PG16 logic relies on Juju update-status or on_topology_change observer events, while in some cases we start Patroni without the observer, causing a long wait until the next update-status arrives.

* DPE-7726 Log Patroni start/stop/restart (to understand charm behavior)

* DPE-7726 Log unit status changes to notice Primary label loss

  It is hard (impossible?) to catch Juju Primary label manipulations from the Juju debug-log. Logging them simplifies troubleshooting.

* DPE-7726 Fixup: log polishing

* DPE-7726 Decrease the waiting-for-DB-connection timeout

  We had to wait 30 seconds when there was no connection, which is unnecessarily long. Also, add details on the reason for a failed connection (RetryError/CannotConnectError).

* DPE-7726 Stop propagating primary_endpoint=None for a single-unit app

  It speeds up single-unit app deployments.

* DPE-7726 Handle the "get primary cluster" RetryError in get_partner_addresses()

  Otherwise the update-status event fails:

  > unit-postgresql-0: relations.async_replication:Partner addresses: []
  > unit-postgresql-0: cluster:Unable to get the state of the cluster
  > Traceback (most recent call last):
  >   File "/var/lib/juju/agents/unit-postgresql-0/charm/src/cluster.py", line 619, in online_cluster_members
  >     cluster_status = self.cluster_status()
  >   File "/var/lib/juju/agents/unit-postgresql-0/charm/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py", line 1116, in wrapped_function
  >     return callable(*args, **kwargs)  # type: ignore
  >   File "/var/lib/juju/agents/unit-postgresql-0/charm/src/cluster.py", line 279, in cluster_status
  >     raise RetryError(
  > tenacity.RetryError: RetryError[<Future at 0xffddafe01160 state=finished raised Exception>]

* DPE-7726 Fix exception on update-status: PostgreSQLUndefinedHostError: Host not set

  Exception:

  > 2025-08-19 20:49:40 DEBUG unit.postgresql/2.juju-log server.go:406 cluster:API get_patroni_health: <Response [200]> (0.057417)
  > 2025-08-19 20:49:40 DEBUG unit.postgresql/2.juju-log server.go:406 cluster:API cluster_status: [{'name': 'postgresql-0', 'role': 'leader', 'state': 'running', 'api_url': 'https://10.182.246.123:8008/patroni', 'host': '10.182.246.123', 'port': 5432, 'timeline': 1}, {'name': 'postgresql-1', 'role': 'sync_standby', 'state': 'running', 'api_url': 'https://10.182.246.163:8008/patroni', 'host': '10.182.246.163', 'port': 5432, 'timeline': 1, 'lag': 0}, {'name': 'postgresql-2', 'role': 'sync_standby', 'state': 'running', 'api_url': 'https://10.182.246.246:8008/patroni', 'host': '10.182.246.246', 'port': 5432, 'timeline': 1, 'lag': 0}]
  > 2025-08-19 20:49:40 DEBUG unit.postgresql/2.juju-log server.go:406 __main__:Early exit primary_endpoint: Primary IP not in cached peer list
  > 2025-08-19 20:49:40 ERROR unit.postgresql/2.juju-log server.go:406 root:Uncaught exception while in charm code:
  > Traceback (most recent call last):
  >   File "/var/lib/juju/agents/unit-postgresql-2/charm/src/charm.py", line 2736, in <module>
  >     main(PostgresqlOperatorCharm)
  >   File "/var/lib/juju/agents/unit-postgresql-2/charm/venv/lib/python3.12/site-packages/ops/__init__.py", line 356, in __call__
  >     return _main.main(charm_class=charm_class, use_juju_for_storage=use_juju_for_storage)
  >   File "/var/lib/juju/agents/unit-postgresql-2/charm/venv/lib/python3.12/site-packages/ops/_main.py", line 502, in main
  >     manager.run()
  >   File "/var/lib/juju/agents/unit-postgresql-2/charm/venv/lib/python3.12/site-packages/ops/_main.py", line 486, in run
  >     self._emit()
  >   File "/var/lib/juju/agents/unit-postgresql-2/charm/venv/lib/python3.12/site-packages/ops/_main.py", line 421, in _emit
  >     self._emit_charm_event(self.dispatcher.event_name)
  >   File "/var/lib/juju/agents/unit-postgresql-2/charm/venv/lib/python3.12/site-packages/ops/_main.py", line 465, in _emit_charm_event
  >     event_to_emit.emit(*args, **kwargs)
  >   File "/var/lib/juju/agents/unit-postgresql-2/charm/venv/lib/python3.12/site-packages/ops/framework.py", line 351, in emit
  >     framework._emit(event)
  >   File "/var/lib/juju/agents/unit-postgresql-2/charm/venv/lib/python3.12/site-packages/ops/framework.py", line 924, in _emit
  >     self._reemit(event_path)
  >   File "/var/lib/juju/agents/unit-postgresql-2/charm/venv/lib/python3.12/site-packages/ops/framework.py", line 1030, in _reemit
  >     custom_handler(event)
  >   File "/var/lib/juju/agents/unit-postgresql-2/charm/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py", line 1116, in wrapped_function
  >     return callable(*args, **kwargs)  # type: ignore
  >   File "/var/lib/juju/agents/unit-postgresql-2/charm/src/charm.py", line 1942, in _on_update_status
  >     self.postgresql_client_relation.oversee_users()
  >   File "/var/lib/juju/agents/unit-postgresql-2/charm/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py", line 1116, in wrapped_function
  >     return callable(*args, **kwargs)  # type: ignore
  >   File "/var/lib/juju/agents/unit-postgresql-2/charm/src/relations/postgresql_provider.py", line 172, in oversee_users
  >     user for user in self.charm.postgresql.list_users() if user.startswith("relation-")
  >   File "/var/lib/juju/agents/unit-postgresql-2/charm/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py", line 1116, in wrapped_function
  >     return callable(*args, **kwargs)  # type: ignore
  >   File "/var/lib/juju/agents/unit-postgresql-2/charm/lib/charms/postgresql_k8s/v1/postgresql.py", line 959, in list_users
  >     with self._connect_to_database(
  >   File "/var/lib/juju/agents/unit-postgresql-2/charm/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py", line 1116, in wrapped_function
  >     return callable(*args, **kwargs)  # type: ignore
  >   File "/var/lib/juju/agents/unit-postgresql-2/charm/lib/charms/postgresql_k8s/v1/postgresql.py", line 273, in _connect_to_database
  >     raise PostgreSQLUndefinedHostError("Host not set")
  > charms.postgresql_k8s.v1.postgresql.PostgreSQLUndefinedHostError: Host not set
  > 2025-08-19 20:49:40 ERROR juju.worker.uniter.operation runhook.go:180 hook "update-status" (via hook dispatching script: dispatch) failed: exit status 1

* DPE-7726 Adopt unit tests for the new code

  Thanks to dragomir.penev@ for the unit test fixes here!

* DPE-7726 Increase the free disk space cleanup timeout (1 -> 3 minutes)

  This backports data-platform-workflow commit f1f8d27 to the local integration test:

  > patch(integration_test_charm.yaml): Increase disk space step timeout (#301)

  Otherwise:

  > Disk usage before cleanup
  > Filesystem      Size  Used Avail Use% Mounted on
  > /dev/root        72G   46G   27G  64% /
  > tmpfs           7.9G   84K  7.9G   1% /dev/shm
  > tmpfs           3.2G  1.1M  3.2G   1% /run
  > tmpfs           5.0M     0  5.0M   0% /run/lock
  > /dev/sdb16      881M   60M  760M   8% /boot
  > /dev/sdb15      105M  6.2M   99M   6% /boot/efi
  > /dev/sda1        74G  4.1G   66G   6% /mnt
  > tmpfs           1.6G   12K  1.6G   1% /run/user/1001
  > Error: The action 'Free up disk space' has timed out after 1 minutes.
1 parent 49d8bb5 commit f9c4aa4

File tree

10 files changed (+164, -109 lines changed)

.github/workflows/integration_test.yaml

Lines changed: 2 additions & 2 deletions

@@ -84,10 +84,10 @@ jobs:
     needs:
       - collect-integration-tests
     runs-on: ${{ matrix.job.runner }}
-    timeout-minutes: 217 # Sum of steps `timeout-minutes` + 5
+    timeout-minutes: 219 # Sum of steps `timeout-minutes` + 5
     steps:
       - name: Free up disk space
-        timeout-minutes: 1
+        timeout-minutes: 3
         run: |
           printf '\nDisk usage before cleanup\n'
           df --human-readable

lib/charms/postgresql_k8s/v1/postgresql.py

Lines changed: 4 additions & 18 deletions

@@ -273,8 +273,11 @@ def _connect_to_database(
             raise PostgreSQLUndefinedHostError("Host not set")
         if not self.password:
             raise PostgreSQLUndefinedPasswordError("Password not set")
+
+        dbname = database if database else self.database
+        logger.debug(f"New DB connection: dbname='{dbname}' user='{self.user}' host='{host}' connect_timeout=1")
         connection = psycopg2.connect(
-            f"dbname='{database if database else self.database}' user='{self.user}' host='{host}'"
+            f"dbname='{dbname}' user='{self.user}' host='{host}'"
             f"password='{self.password}' connect_timeout=1"
         )
         connection.autocommit = True
@@ -1322,23 +1325,6 @@ def update_user_password(
             if connection is not None:
                 connection.close()

-    def is_restart_pending(self) -> bool:
-        """Query pg_settings for pending restart."""
-        connection = None
-        try:
-            with self._connect_to_database() as connection, connection.cursor() as cursor:
-                cursor.execute("SELECT COUNT(*) FROM pg_settings WHERE pending_restart=True;")
-                return cursor.fetchone()[0] > 0
-        except psycopg2.OperationalError:
-            logger.warning("Failed to connect to PostgreSQL.")
-            return False
-        except psycopg2.Error as e:
-            logger.error(f"Failed to check if restart is pending: {e}")
-            return False
-        finally:
-            if connection:
-                connection.close()
-
     def database_exists(self, db: str) -> bool:
         """Check whether specified database exists."""
         connection = None

src/charm.py

Lines changed: 34 additions & 57 deletions

@@ -177,7 +177,7 @@ def unit_number(unit_name: str):
         # Lowest unit number is last to refresh
         last_unit_to_refresh = sorted(all_units, key=unit_number)[0].replace("/", "-")
         if self._charm._patroni.get_primary() == last_unit_to_refresh:
-            logging.info(
+            logger.info(
                 f"Unit {last_unit_to_refresh} was already primary during pre-refresh check"
             )
         else:
@@ -187,7 +187,7 @@ def unit_number(unit_name: str):
                 logger.warning(f"switchover failed with reason: {e}")
                 raise charm_refresh.PrecheckFailed("Unable to switch primary")
             else:
-                logging.info(
+                logger.info(
                     f"Switched primary to unit {last_unit_to_refresh} during pre-refresh check"
                 )

@@ -477,21 +477,6 @@ def patroni_scrape_config(self) -> list[dict]:
             }
         ]

-    @property
-    def app_units(self) -> set[Unit]:
-        """The peer-related units in the application."""
-        if not self._peers:
-            return set()
-
-        return {self.unit, *self._peers.units}
-
-    def scoped_peer_data(self, scope: SCOPES) -> dict | None:
-        """Returns peer data based on scope."""
-        if scope == APP_SCOPE:
-            return self.app_peer_data
-        elif scope == UNIT_SCOPE:
-            return self.unit_peer_data
-
     @property
     def app_peer_data(self) -> dict:
         """Application peer relation data object."""
@@ -628,7 +613,6 @@ def postgresql(self) -> PostgreSQL:
         """Returns an instance of the object used to interact with the database."""
         password = str(self.get_secret(APP_SCOPE, f"{USER}-password"))
         if self._postgresql is None or self._postgresql.primary_host is None:
-            logger.debug("Init class PostgreSQL")
             self._postgresql = PostgreSQL(
                 primary_host=self.primary_endpoint,
                 current_host=self._unit_ip,
@@ -655,15 +639,17 @@ def primary_endpoint(self) -> str | None:
                     # Force a retry if there is no primary or the member that was
                     # returned is not in the list of the current cluster members
                     # (like when the cluster was not updated yet after a failed switchover).
-                    if not primary_endpoint or primary_endpoint not in self._units_ips:
-                        # TODO figure out why peer data is not available
-                        if primary_endpoint and len(self._units_ips) == 1 and len(self._peers.units) > 1:
-                            logger.warning(
-                                "Possibly incomplete peer data: Will not map primary IP to unit IP"
-                            )
-                            return primary_endpoint
-                        logger.debug("primary endpoint early exit: Primary IP not in cached peer list.")
+                    if not primary_endpoint:
+                        logger.warning(f"Missing primary IP for {primary}")
                         primary_endpoint = None
+                    elif primary_endpoint not in self._units_ips:
+                        if len(self._peers.units) == 0:
+                            logger.info(f"The unit didn't join {PEER} relation? Using {primary_endpoint}")
+                        elif len(self._units_ips) == 1 and len(self._peers.units) > 1:
+                            logger.warning(f"Possibly incomplete peer data, keep using {primary_endpoint}")
+                        else:
+                            logger.debug("Early exit primary_endpoint: Primary IP not in cached peer list")
+                            primary_endpoint = None
         except RetryError:
             return None
         else:
@@ -1348,7 +1334,7 @@ def _on_cluster_topology_change(self, _):
         logger.info("Cluster topology changed")
         if self.primary_endpoint:
             self._update_relation_endpoints()
-            self.set_unit_status(ActiveStatus())
+            self._set_primary_status_message()

     def _on_install(self, event: InstallEvent) -> None:
         """Install prerequisites for the application."""
@@ -1460,10 +1446,8 @@ def _on_config_changed(self, event) -> None: # noqa: C901
             return

         if self.refresh is None:
-            logger.debug("Defer on_config_changed: Refresh could be in progress")
-            event.defer()
-            return
-        if self.refresh.in_progress:
+            logger.warning("Warning _on_config_changed: Refresh could be in progress")
+        elif self.refresh.in_progress:
             logger.debug("Defer on_config_changed: Refresh in progress")
             event.defer()
             return
@@ -1585,10 +1569,8 @@ def _can_start(self, event: StartEvent) -> bool:

         # Safeguard against starting while refreshing.
         if self.refresh is None:
-            logger.debug("Defer on_start: Refresh could be in progress")
-            event.defer()
-            return False
-        if self.refresh.in_progress:
+            logger.warning("Warning on_start: Refresh could be in progress")
+        elif self.refresh.in_progress:
             # TODO: we should probably start workload if scale up while refresh in progress
             logger.debug("Defer on_start: Refresh in progress")
             event.defer()
@@ -1657,7 +1639,7 @@ def _restart_metrics_service(self, postgres_snap: snap.Snap) -> None:
         try:
             snap_password = postgres_snap.get("exporter.password")
         except snap.SnapError:
-            logger.warning("Early exit: Trying to reset metrics service with no configuration set")
+            logger.warning("Early exit: skipping exporter setup (no configuration set)")
             return None

         if snap_password != self.get_secret(APP_SCOPE, MONITORING_PASSWORD_KEY):
@@ -1963,6 +1945,7 @@ def _on_update_status(self, _) -> None:

         if not self._patroni.member_started and self._patroni.is_member_isolated:
             self._patroni.restart_patroni()
+            self._observer.start_observer()
             return

         # Update the sync-standby endpoint in the async replication data.
@@ -2092,8 +2075,9 @@ def _handle_processes_failures(self) -> bool:
             logger.info("PostgreSQL data directory was not empty. Moved pg_wal")
             return True
         try:
-            self._patroni.restart_patroni()
             logger.info("restarted PostgreSQL because it was not running")
+            self._patroni.restart_patroni()
+            self._observer.start_observer()
             return True
         except RetryError:
             logger.error("failed to restart PostgreSQL after checking that it was not running")
@@ -2124,11 +2108,8 @@ def _set_primary_status_message(self) -> None:
                     danger_state = " (read-only)"
                 elif len(self._patroni.get_running_cluster_members()) < self.app.planned_units():
                     danger_state = " (degraded)"
-                self.set_unit_status(
-                    ActiveStatus(
-                        f"{'Standby' if self.is_standby_leader else 'Primary'}{danger_state}"
-                    )
-                )
+                unit_status = "Standby" if self.is_standby_leader else "Primary"
+                self.set_unit_status(ActiveStatus(f"{unit_status}{danger_state}"))
             elif self._patroni.member_started:
                 self.set_unit_status(ActiveStatus())
         except (RetryError, ConnectionError) as e:
@@ -2335,12 +2316,13 @@ def _can_connect_to_postgresql(self) -> bool:
         if not self.postgresql.password or not self.postgresql.current_host:
             return False
         try:
-            for attempt in Retrying(stop=stop_after_delay(30), wait=wait_fixed(3)):
+            for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3)):
                 with attempt:
                     if not self.postgresql.get_postgresql_timezones():
+                        logger.debug("Cannot connect to database (CannotConnectError)")
                         raise CannotConnectError
         except RetryError:
-            logger.debug("Cannot connect to database")
+            logger.debug("Cannot connect to database (RetryError)")
             return False
         return True

@@ -2381,7 +2363,7 @@ def update_config(
             parameters=pg_parameters,
             no_peers=no_peers,
             user_databases_map=self.relations_user_databases_map,
-            slots=replication_slots or None,
+            slots=replication_slots,
         )
         if no_peers:
             return True
@@ -2489,18 +2471,13 @@ def _handle_postgresql_restart_need(self) -> None:
                 self._patroni.reload_patroni_configuration()
             except Exception as e:
                 logger.error(f"Reload patroni call failed! error: {e!s}")
-        # Wait for some more time than the Patroni's loop_wait default value (10 seconds),
-        # which tells how much time Patroni will wait before checking the configuration
-        # file again to reload it.
-        try:
-            for attempt in Retrying(stop=stop_after_attempt(5), wait=wait_fixed(3)):
-                with attempt:
-                    restart_postgresql = restart_postgresql or self.postgresql.is_restart_pending()
-                    if not restart_postgresql:
-                        raise Exception
-        except RetryError:
-            # Ignore the error, as it happens only to indicate that the configuration has not changed.
-            pass
+
+        restart_pending = self._patroni.is_restart_pending()
+        logger.debug(
+            f"Checking if restart pending: TLS={restart_postgresql} or API={restart_pending}"
+        )
+        restart_postgresql = restart_postgresql or restart_pending
+
         self.unit_peer_data.update({"tls": "enabled" if self.is_tls_enabled else ""})
         self.postgresql_client_relation.update_endpoints()

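The commit message also mentions adopting unit tests for the new code. Below is a hedged sketch of how such a test might stub the Patroni REST response for a pending-restart check like the one sketched after the commit message; the function and test names are illustrative and are not the repository's actual tests.

```python
from unittest.mock import MagicMock, patch

import requests


def is_restart_pending(patroni_url: str) -> bool:
    """Hypothetical helper mirroring the sketch shown after the commit message."""
    response = requests.get(f"{patroni_url}/patroni", verify=False, timeout=10)
    return response.json().get("pending_restart", False)


@patch("requests.get")
def test_restart_pending_reported(mock_get):
    # Patroni reports pending_restart=True until the member is restarted.
    mock_get.return_value = MagicMock(
        json=lambda: {"state": "running", "pending_restart": True}
    )
    assert is_restart_pending("https://10.0.0.1:8008") is True


@patch("requests.get")
def test_restart_pending_absent(mock_get):
    # Assume the field is omitted when no restart is needed.
    mock_get.return_value = MagicMock(json=lambda: {"state": "running"})
    assert is_restart_pending("https://10.0.0.1:8008") is False
```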