From 9158ea6e0dd0c78a18ae86ea00aa399011b3a2ae Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 25 Jul 2024 18:49:06 +0300 Subject: [PATCH 01/19] Pause Patroni in the TLS test --- tests/integration/ha_tests/helpers.py | 20 ++++++++++++++++---- tests/integration/test_tls.py | 8 ++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index ae027edc25..0706cae6b6 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -100,21 +100,28 @@ def get_patroni_cluster(unit_ip: str) -> Dict[str, str]: return resp.json() -async def change_patroni_setting(ops_test: OpsTest, setting: str, value: int) -> None: +async def change_patroni_setting( + ops_test: OpsTest, setting: str, value: Union[int, str], tls: bool = False +) -> None: """Change the value of one of the Patroni settings. Args: ops_test: ops_test instance. setting: the name of the setting. value: the value to assign to the setting. + tls: if Patroni is serving using tls. """ + if tls: + schema = "https" + else: + schema = "http" for attempt in Retrying(stop=stop_after_delay(30 * 2), wait=wait_fixed(3)): with attempt: app = await app_name(ops_test) primary_name = await get_primary(ops_test, app) unit_ip = await get_unit_address(ops_test, primary_name) requests.patch( - f"http://{unit_ip}:8008/config", + f"{schema}://{unit_ip}:8008/config", json={setting: value}, ) @@ -404,22 +411,27 @@ def get_host_ip(host: str) -> str: return member_ips -async def get_patroni_setting(ops_test: OpsTest, setting: str) -> Optional[int]: +async def get_patroni_setting(ops_test: OpsTest, setting: str, tls: bool = False) -> Optional[int]: """Get the value of one of the integer Patroni settings. Args: ops_test: ops_test instance. setting: the name of the setting. + tls: if Patroni is serving using tls. Returns: the value of the configuration or None if it's using the default value. 
""" + if tls: + schema = "https" + else: + schema = "http" for attempt in Retrying(stop=stop_after_delay(30 * 2), wait=wait_fixed(3)): with attempt: app = await app_name(ops_test) primary_name = await get_primary(ops_test, app) unit_ip = await get_unit_address(ops_test, primary_name) - configuration_info = requests.get(f"http://{unit_ip}:8008/config") + configuration_info = requests.get(f"{schema}://{unit_ip}:8008/config") primary_start_timeout = configuration_info.json().get(setting) return int(primary_start_timeout) if primary_start_timeout is not None else None diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index be512a9c32..8220bb5a55 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -9,6 +9,9 @@ from tenacity import Retrying, stop_after_delay, wait_fixed from . import architecture, markers +from .ha_tests.helpers import ( + change_patroni_setting, +) from .helpers import ( DATABASE_APP_NAME, build_and_deploy, @@ -122,7 +125,10 @@ async def test_mattermost_db(ops_test: OpsTest) -> None: await ops_test.model.wait_for_idle( apps=[DATABASE_APP_NAME], status="active", idle_period=30 ) + # Pause Patroni so it doesn't wipe the custom changes + await change_patroni_setting(ops_test, "pause", True, use_random_unit=True, tls=True) + async with ops_test.fast_forward("24h"): for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(2), reraise=True): with attempt: # Promote the replica to primary. 
@@ -165,7 +171,9 @@ async def test_mattermost_db(ops_test: OpsTest) -> None: for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(2), reraise=True): with attempt: await check_tls_rewind(ops_test) + await change_patroni_setting(ops_test, "pause", False, use_random_unit=True, tls=True) + async with ops_test.fast_forward(): # Await for postgresql to be stable if not already await ops_test.model.wait_for_idle( apps=[DATABASE_APP_NAME], status="active", idle_period=15 From 3c41be3a1b5f51a3e30a1b209e016e23ad9f2a08 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 25 Jul 2024 19:10:13 +0300 Subject: [PATCH 02/19] Wrong arg --- tests/integration/test_tls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index 8220bb5a55..f7e79a13e7 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -126,7 +126,7 @@ async def test_mattermost_db(ops_test: OpsTest) -> None: apps=[DATABASE_APP_NAME], status="active", idle_period=30 ) # Pause Patroni so it doesn't wipe the custom changes - await change_patroni_setting(ops_test, "pause", True, use_random_unit=True, tls=True) + await change_patroni_setting(ops_test, "pause", True, tls=True) async with ops_test.fast_forward("24h"): for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(2), reraise=True): From 8e7db4d93e945a432988bd8da3c4435e848558b4 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 25 Jul 2024 19:59:58 +0300 Subject: [PATCH 03/19] Disable verify --- tests/integration/ha_tests/helpers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index 0706cae6b6..9078eea1a6 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -123,6 +123,7 @@ async def change_patroni_setting( requests.patch( f"{schema}://{unit_ip}:8008/config", json={setting: 
value}, + verify=not tls, ) @@ -431,7 +432,7 @@ async def get_patroni_setting(ops_test: OpsTest, setting: str, tls: bool = False app = await app_name(ops_test) primary_name = await get_primary(ops_test, app) unit_ip = await get_unit_address(ops_test, primary_name) - configuration_info = requests.get(f"{schema}://{unit_ip}:8008/config") + configuration_info = requests.get(f"{schema}://{unit_ip}:8008/config", verify=not tls) primary_start_timeout = configuration_info.json().get(setting) return int(primary_start_timeout) if primary_start_timeout is not None else None From 0b04b40873d999c0b0b5524207834d3df292f0f8 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 25 Jul 2024 21:15:10 +0300 Subject: [PATCH 04/19] Other wrong arg --- tests/integration/test_tls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index f7e79a13e7..0f159d9bb3 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -171,7 +171,7 @@ async def test_mattermost_db(ops_test: OpsTest) -> None: for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(2), reraise=True): with attempt: await check_tls_rewind(ops_test) - await change_patroni_setting(ops_test, "pause", False, use_random_unit=True, tls=True) + await change_patroni_setting(ops_test, "pause", False, tls=True) async with ops_test.fast_forward(): # Await for postgresql to be stable if not already From 55925a08c12065c31e9062d2eb06d8bb9157d4e1 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 26 Jul 2024 11:40:10 +0300 Subject: [PATCH 05/19] Retry stopping --- tests/integration/test_tls.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index 0f159d9bb3..494695bb0f 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -6,7 +6,7 @@ import pytest as pytest import requests from pytest_operator.plugin 
import OpsTest -from tenacity import Retrying, stop_after_delay, wait_fixed +from tenacity import Retrying, stop_after_attempt, stop_after_delay, wait_fixed from . import architecture, markers from .ha_tests.helpers import ( @@ -159,8 +159,10 @@ async def test_mattermost_db(ops_test: OpsTest) -> None: connection.close() # Stop the initial primary. - logger.info(f"stopping database on {primary}") - await run_command_on_unit(ops_test, primary, "/charm/bin/pebble stop postgresql") + for attempt in Retrying(stop=stop_after_attempt(10), wait=wait_fixed(5), reraise=True): + with attempt: + logger.info(f"stopping database on {primary}") + await run_command_on_unit(ops_test, primary, "/charm/bin/pebble stop postgresql") # Check that the primary changed. assert await primary_changed(ops_test, primary), "primary not changed" From 7b5cd24cc3bd9dd5638b866d55401dc8c09f1a3c Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 26 Jul 2024 13:05:01 +0300 Subject: [PATCH 06/19] Split tls test --- tests/integration/test_tls.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index 494695bb0f..57e96c57d1 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -58,7 +58,6 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: await build_and_deploy(ops_test, DATABASE_UNITS, wait_for_idle=False) -@pytest.mark.group(1) async def check_tls_rewind(ops_test: OpsTest) -> None: """Checks if TLS was used by rewind.""" for unit in ops_test.model.applications[DATABASE_APP_NAME].units: @@ -79,15 +78,7 @@ async def check_tls_rewind(ops_test: OpsTest) -> None: @pytest.mark.group(1) -@markers.amd64_only # mattermost-k8s charm not available for arm64 -async def test_mattermost_db(ops_test: OpsTest) -> None: - """Deploy Mattermost to test the 'db' relation. - - Mattermost needs TLS enabled on PostgreSQL to correctly connect to it. 
- - Args: - ops_test: The ops test framework - """ +async def test_tls(ops_test: OpsTest) -> None: async with ops_test.fast_forward(): # Deploy TLS Certificates operator. await ops_test.model.deploy( @@ -175,6 +166,17 @@ async def test_mattermost_db(ops_test: OpsTest) -> None: await check_tls_rewind(ops_test) await change_patroni_setting(ops_test, "pause", False, tls=True) + +@pytest.mark.group(1) +@markers.amd64_only # mattermost-k8s charm not available for arm64 +async def test_mattermost_db(ops_test: OpsTest) -> None: + """Deploy Mattermost to test the 'db' relation. + + Mattermost needs TLS enabled on PostgreSQL to correctly connect to it. + + Args: + ops_test: The ops test framework + """ async with ops_test.fast_forward(): # Await for postgresql to be stable if not already await ops_test.model.wait_for_idle( From c1ad590bddaa0a25811185e2be60e4c4b8b03419 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 26 Jul 2024 13:36:01 +0300 Subject: [PATCH 07/19] Abort on failed rewind test --- tests/integration/test_tls.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index 57e96c57d1..fefeb18edf 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -78,6 +78,7 @@ async def check_tls_rewind(ops_test: OpsTest) -> None: @pytest.mark.group(1) +@pytest.mark.abort_on_fail async def test_tls(ops_test: OpsTest) -> None: async with ops_test.fast_forward(): # Deploy TLS Certificates operator. 
From 704ebd7ebde8f80ef36bb31afeb799c38f36ffe4 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 26 Jul 2024 17:51:55 +0300 Subject: [PATCH 08/19] Split off remove TLS test --- tests/integration/test_tls.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index fefeb18edf..1b2d918886 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -194,11 +194,15 @@ async def test_mattermost_db(ops_test: OpsTest) -> None: await check_database_users_existence(ops_test, mattermost_users, []) + +@pytest.mark.group(1) +async def test_remove_tls(ops_test: OpsTest) -> None: + async with ops_test.fast_forward(): # Remove the relation. await ops_test.model.applications[DATABASE_APP_NAME].remove_relation( f"{DATABASE_APP_NAME}:certificates", f"{tls_certificates_app_name}:certificates" ) - await ops_test.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active", timeout=1000) + await ops_test.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active", timeout=1500) # Wait for all units disabling TLS. 
for unit in ops_test.model.applications[DATABASE_APP_NAME].units: From a3c2464149f3187b495b0cddcccbbfae4df25019 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 26 Jul 2024 18:31:32 +0300 Subject: [PATCH 09/19] Idle in the rewind test --- tests/integration/test_tls.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index d364a8c564..24cbb8776b 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -165,6 +165,12 @@ async def test_tls(ops_test: OpsTest) -> None: await check_tls_rewind(ops_test) await change_patroni_setting(ops_test, "pause", False, tls=True) + async with ops_test.fast_forward(): + # Await for postgresql to be stable if not already + await ops_test.model.wait_for_idle( + apps=[DATABASE_APP_NAME], status="active", idle_period=15 + ) + @pytest.mark.group(1) @markers.amd64_only # mattermost-k8s charm not available for arm64 @@ -177,11 +183,6 @@ async def test_mattermost_db(ops_test: OpsTest) -> None: ops_test: The ops test framework """ async with ops_test.fast_forward(): - # Await for postgresql to be stable if not already - await ops_test.model.wait_for_idle( - apps=[DATABASE_APP_NAME], status="active", idle_period=15 - ) - # Deploy and check Mattermost user and database existence. 
relation_id = await deploy_and_relate_application_with_postgresql( ops_test, "mattermost-k8s", MATTERMOST_APP_NAME, APPLICATION_UNITS, status="waiting" From 68948f52226b7c14851067cf0cbecf629883d6c3 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Wed, 31 Jul 2024 19:07:04 +0300 Subject: [PATCH 10/19] Catch restart error --- src/charm.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/charm.py b/src/charm.py index 982b25a105..c5dff988d8 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1293,15 +1293,20 @@ def _on_stop(self, _): f"failed to patch k8s {type(resource).__name__} {resource.metadata.name}" ) - def _on_update_status(self, _) -> None: - """Update the unit status message.""" + def _on_update_status_early_exit_checks(self, container) -> bool: if not self.upgrade.idle: logger.debug("Early exit on_update_status: upgrade in progress") - return + return False - container = self.unit.get_container("postgresql") if not container.can_connect(): logger.debug("on_update_status early exit: Cannot connect to container") + return False + return True + + def _on_update_status(self, _) -> None: + """Update the unit status message.""" + container = self.unit.get_container("postgresql") + if not self._on_update_status_early_exit_checks(container): return if self._has_blocked_status or self._has_non_restore_waiting_status: @@ -1325,7 +1330,10 @@ def _on_update_status(self, _) -> None: logger.warning( "%s pebble service inactive, restarting service" % self._postgresql_service ) - container.restart(self._postgresql_service) + try: + container.restart(self._postgresql_service) + except ChangeError: + logger.exception("Failed to restart patroni") # If service doesn't recover fast, exit and wait for next hook run to re-check if not self._patroni.member_started: self.unit.status = MaintenanceStatus("Database service inactive, restarting") From 89dba90c52192f59aa1b80c8e085d853c542522c Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: 
Wed, 31 Jul 2024 22:45:20 +0300 Subject: [PATCH 11/19] manually start service --- tests/integration/test_tls.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index 24cbb8776b..2b47a83836 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -159,6 +159,9 @@ async def test_tls(ops_test: OpsTest) -> None: # Check that the primary changed. assert await primary_changed(ops_test, primary), "primary not changed" + # Restart the initial primary and check the logs to ensure TLS is being used by pg_rewind. + logger.info(f"starting database on {primary}") + await run_command_on_unit(ops_test, primary, "/charm/bin/pebble start postgresql") # Check the logs to ensure TLS is being used by pg_rewind. for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(2), reraise=True): with attempt: @@ -201,7 +204,7 @@ async def test_remove_tls(ops_test: OpsTest) -> None: await ops_test.model.applications[DATABASE_APP_NAME].remove_relation( f"{DATABASE_APP_NAME}:certificates", f"{tls_certificates_app_name}:certificates" ) - await ops_test.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active", timeout=1500) + await ops_test.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active", timeout=600) # Wait for all units disabling TLS. for unit in ops_test.model.applications[DATABASE_APP_NAME].units: From c0051337c626828ce2fd7628a69bdbdab6623653 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 1 Aug 2024 11:42:02 +0300 Subject: [PATCH 12/19] Revert test tweaks --- tests/integration/test_tls.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index 2b47a83836..b6855e9f42 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -159,9 +159,6 @@ async def test_tls(ops_test: OpsTest) -> None: # Check that the primary changed. 
assert await primary_changed(ops_test, primary), "primary not changed" - # Restart the initial primary and check the logs to ensure TLS is being used by pg_rewind. - logger.info(f"starting database on {primary}") - await run_command_on_unit(ops_test, primary, "/charm/bin/pebble start postgresql") # Check the logs to ensure TLS is being used by pg_rewind. for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(2), reraise=True): with attempt: @@ -204,7 +201,7 @@ async def test_remove_tls(ops_test: OpsTest) -> None: await ops_test.model.applications[DATABASE_APP_NAME].remove_relation( f"{DATABASE_APP_NAME}:certificates", f"{tls_certificates_app_name}:certificates" ) - await ops_test.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active", timeout=600) + await ops_test.model.wait_for_idle(apps=[DATABASE_APP_NAME], status="active", timeout=1000) # Wait for all units disabling TLS. for unit in ops_test.model.applications[DATABASE_APP_NAME].units: From 85812f3083a658957e4b5ddc85397697cc06eb2e Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 1 Aug 2024 11:56:43 +0300 Subject: [PATCH 13/19] Check tls relation when checking for certificates --- src/charm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/charm.py b/src/charm.py index c5dff988d8..ac5d3a90c6 100755 --- a/src/charm.py +++ b/src/charm.py @@ -25,7 +25,7 @@ PostgreSQLEnableDisableExtensionError, PostgreSQLUpdateUserPasswordError, ) -from charms.postgresql_k8s.v0.postgresql_tls import PostgreSQLTLS +from charms.postgresql_k8s.v0.postgresql_tls import TLS_RELATION, PostgreSQLTLS from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider from charms.rolling_ops.v0.rollingops import RollingOpsManager, RunWithLock from charms.tempo_k8s.v1.charm_tracing import trace_charm @@ -1464,7 +1464,7 @@ def is_standby_leader(self) -> bool: @property def is_tls_enabled(self) -> bool: """Return whether TLS is enabled.""" - if not 
self.model.get_relation(PEER): + if not self.model.get_relation(PEER) or not self.model.get_relation(TLS_RELATION): return False return all(self.tls.get_tls_files()) From bcac1b9a8321b452f1d823ed3a0eb81814ad01c3 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 1 Aug 2024 13:01:31 +0300 Subject: [PATCH 14/19] Update config when restarting leader --- src/charm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/charm.py b/src/charm.py index ac5d3a90c6..62e529ee9f 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1661,6 +1661,8 @@ def _restart(self, event: RunWithLock) -> None: try: logger.debug("Restarting PostgreSQL") + if self.unit.is_leader(): + self.update_config() self._patroni.restart_postgresql() except RetryError: error_message = "failed to restart PostgreSQL" From 7eafe84451e3987d7be56063bdfd92266d1c318f Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 1 Aug 2024 17:16:06 +0300 Subject: [PATCH 15/19] Revert restart changes and juju 2 overkill --- src/charm.py | 6 ++---- tests/integration/test_tls.py | 15 ++++++++++----- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/charm.py b/src/charm.py index 62e529ee9f..c5dff988d8 100755 --- a/src/charm.py +++ b/src/charm.py @@ -25,7 +25,7 @@ PostgreSQLEnableDisableExtensionError, PostgreSQLUpdateUserPasswordError, ) -from charms.postgresql_k8s.v0.postgresql_tls import TLS_RELATION, PostgreSQLTLS +from charms.postgresql_k8s.v0.postgresql_tls import PostgreSQLTLS from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider from charms.rolling_ops.v0.rollingops import RollingOpsManager, RunWithLock from charms.tempo_k8s.v1.charm_tracing import trace_charm @@ -1464,7 +1464,7 @@ def is_standby_leader(self) -> bool: @property def is_tls_enabled(self) -> bool: """Return whether TLS is enabled.""" - if not self.model.get_relation(PEER) or not self.model.get_relation(TLS_RELATION): + if not self.model.get_relation(PEER): return False return all(self.tls.get_tls_files()) 
@@ -1661,8 +1661,6 @@ def _restart(self, event: RunWithLock) -> None: try: logger.debug("Restarting PostgreSQL") - if self.unit.is_leader(): - self.update_config() self._patroni.restart_postgresql() except RetryError: error_message = "failed to restart PostgreSQL" diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index b6855e9f42..8b04f74035 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -6,11 +6,12 @@ import pytest as pytest import requests from pytest_operator.plugin import OpsTest -from tenacity import Retrying, stop_after_attempt, stop_after_delay, wait_fixed +from tenacity import Retrying, stop_after_delay, wait_fixed from . import architecture, markers from .ha_tests.helpers import ( change_patroni_setting, + send_signal_to_process, ) from .helpers import ( DATABASE_APP_NAME, @@ -151,10 +152,14 @@ async def test_tls(ops_test: OpsTest) -> None: connection.close() # Stop the initial primary. - for attempt in Retrying(stop=stop_after_attempt(10), wait=wait_fixed(5), reraise=True): - with attempt: - logger.info(f"stopping database on {primary}") - await run_command_on_unit(ops_test, primary, "/charm/bin/pebble stop postgresql") + logger.info(f"stopping database on {primary}") + try: + await run_command_on_unit(ops_test, primary, "/charm/bin/pebble stop postgresql") + except Exception as e: + # pebble stop on juju 2 errors out and leaves dangling PG processes + if juju_major_version > 2: + raise e + await send_signal_to_process(ops_test, primary, "SIGTERM", "postgres") # Check that the primary changed. 
assert await primary_changed(ops_test, primary), "primary not changed" From cbadff14782a870dfc993e95c5313ec65098fe71 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Thu, 1 Aug 2024 18:00:49 +0300 Subject: [PATCH 16/19] Directly TERM the postgres process --- tests/integration/test_tls.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index 8b04f74035..565fe453ec 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -11,7 +11,6 @@ from . import architecture, markers from .ha_tests.helpers import ( change_patroni_setting, - send_signal_to_process, ) from .helpers import ( DATABASE_APP_NAME, @@ -159,7 +158,7 @@ async def test_tls(ops_test: OpsTest) -> None: # pebble stop on juju 2 errors out and leaves dangling PG processes if juju_major_version > 2: raise e - await send_signal_to_process(ops_test, primary, "SIGTERM", "postgres") + await run_command_on_unit(ops_test, primary, "pkill --signal SIGTERM -x postgres") # Check that the primary changed. assert await primary_changed(ops_test, primary), "primary not changed" From 5ec3edfe38b65227c77f0da9193dfb5086a63681 Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 2 Aug 2024 11:30:49 +0300 Subject: [PATCH 17/19] Ignore volume errors --- tests/integration/test_tls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index 565fe453ec..aa30978d45 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -87,7 +87,7 @@ async def test_tls(ops_test: OpsTest) -> None: ) # Relate it to the PostgreSQL to enable TLS. await ops_test.model.relate(DATABASE_APP_NAME, tls_certificates_app_name) - await ops_test.model.wait_for_idle(status="active", timeout=1000) + await ops_test.model.wait_for_idle(status="active", timeout=1000, raise_on_error=False) # Wait for all units enabling TLS. 
for unit in ops_test.model.applications[DATABASE_APP_NAME].units: From bbc6e77397fb5f1f05e5a355d66dabc3363c1aaf Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 2 Aug 2024 17:50:55 +0300 Subject: [PATCH 18/19] Unit tests --- tests/unit/test_charm.py | 136 ++++++++++++++++++++------------------- 1 file changed, 69 insertions(+), 67 deletions(-) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index d00edef8fe..09ee0a8d7a 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -207,7 +207,7 @@ def test_on_postgresql_pebble_ready(harness): # Check that the initial plan is empty. harness.set_can_connect(POSTGRESQL_CONTAINER, True) plan = harness.get_container_pebble_plan(POSTGRESQL_CONTAINER) - tc.assertEqual(plan.to_dict(), {}) + assert plan.to_dict() == {} # Get the current and the expected layer from the pebble plan and the _postgresql_layer # method, respectively. @@ -217,13 +217,13 @@ def test_on_postgresql_pebble_ready(harness): # Check for a Waiting status when the primary k8s endpoint is not ready yet. harness.container_pebble_ready(POSTGRESQL_CONTAINER) _create_pgdata.assert_called_once() - tc.assertTrue(isinstance(harness.model.unit.status, WaitingStatus)) + assert isinstance(harness.model.unit.status, WaitingStatus) _set_active_status.assert_not_called() # Check for a Blocked status when a failure happens. _idle.return_value = True harness.container_pebble_ready(POSTGRESQL_CONTAINER) - tc.assertTrue(isinstance(harness.model.unit.status, BlockedStatus)) + assert isinstance(harness.model.unit.status, BlockedStatus) _set_active_status.assert_not_called() # No error when upgrading the cluster. @@ -240,10 +240,10 @@ def test_on_postgresql_pebble_ready(harness): expected.pop("summary", "") expected.pop("description", "") # Check the plan is as expected. 
- tc.assertEqual(plan.to_dict(), expected) + assert plan.to_dict() == expected _set_active_status.assert_called_once() container = harness.model.unit.get_container(POSTGRESQL_CONTAINER) - tc.assertEqual(container.get_service("postgresql").is_running(), True) + assert container.get_service("postgresql").is_running() _push_tls_files_to_workload.assert_called_once() @@ -263,7 +263,7 @@ def test_on_postgresql_pebble_ready_no_connection(harness): # Event was deferred and status is still maintenance mock_event.defer.assert_called_once() mock_event.set_results.assert_not_called() - tc.assertIsInstance(harness.model.unit.status, MaintenanceStatus) + assert isinstance(harness.model.unit.status, MaintenanceStatus) def test_on_config_changed(harness): @@ -409,13 +409,13 @@ def test_on_set_password(harness): # Test without providing the username option. harness.charm._on_set_password(mock_event) - tc.assertEqual(_set_secret.call_args_list[0][0][1], "operator-password") + assert _set_secret.call_args_list[0][0][1] == "operator-password" # Also test providing the username option. _set_secret.reset_mock() mock_event.params["username"] = "replication" harness.charm._on_set_password(mock_event) - tc.assertEqual(_set_secret.call_args_list[0][0][1], "replication-password") + assert _set_secret.call_args_list[0][0][1] == "replication-password" # And test providing both the username and password options. 
_set_secret.reset_mock() @@ -449,15 +449,24 @@ def test_fail_to_get_primary(harness): @patch_network_get(private_address="1.1.1.1") def test_on_update_status(harness): with ( + patch("charm.logger") as _logger, patch( "charm.PostgresqlOperatorCharm._handle_processes_failures" ) as _handle_processes_failures, patch("charm.Patroni.member_started") as _member_started, patch("charm.Patroni.get_primary") as _get_primary, patch("ops.model.Container.pebble") as _pebble, - patch("ops.model.Container.restart", new_callable=PropertyMock), + patch("ops.model.Container.restart") as _restart, patch("upgrade.PostgreSQLUpgrade.idle", return_value="idle"), ): + # Early exit on can connect. + harness.set_can_connect(POSTGRESQL_CONTAINER, False) + harness.charm.on.update_status.emit() + _get_primary.assert_not_called() + _logger.debug.assert_called_once_with( + "on_update_status early exit: Cannot connect to container" + ) + # Test before the PostgreSQL service is available. _pebble.get_services.return_value = [] harness.set_can_connect(POSTGRESQL_CONTAINER, True) @@ -474,6 +483,19 @@ def test_on_update_status(harness): _pebble.get_services.return_value = [MagicMock(current=ServiceStatus.INACTIVE)] harness.charm.on.update_status.emit() _get_primary.assert_not_called() + _restart.assert_called_once_with("postgresql") + _restart.reset_mock() + + # Test restart failed + _pebble.get_services.return_value = [MagicMock(current=ServiceStatus.INACTIVE)] + _restart.side_effect = ChangeError(err=None, change=None) + harness.charm.on.update_status.emit() + _get_primary.assert_not_called() + _restart.assert_called_once_with("postgresql") + _logger.exception.assert_called_once_with("Failed to restart patroni") + _restart.reset_mock() + _logger.exception.reset_mock() + _restart.side_effect = None # Check primary message not being set (current unit is not the primary). 
_pebble.get_services.return_value = [MagicMock(current=ServiceStatus.ACTIVE)] @@ -484,17 +506,11 @@ def test_on_update_status(harness): ] harness.charm.on.update_status.emit() _get_primary.assert_called_once() - tc.assertNotEqual( - harness.model.unit.status, - ActiveStatus("Primary"), - ) + assert harness.model.unit.status != ActiveStatus("Primary") # Test again and check primary message being set (current unit is the primary). harness.charm.on.update_status.emit() - tc.assertEqual( - harness.model.unit.status, - ActiveStatus("Primary"), - ) + assert harness.model.unit.status == ActiveStatus("Primary") def test_on_update_status_no_connection(harness): @@ -707,7 +723,7 @@ def test_on_update_status_after_restore_operation(harness): _update_config.assert_not_called() _handle_processes_failures.assert_not_called() _set_active_status.assert_not_called() - tc.assertIsInstance(harness.charm.unit.status, BlockedStatus) + assert isinstance(harness.charm.unit.status, BlockedStatus) # Test when the restore operation hasn't finished yet. harness.charm.unit.status = ActiveStatus() @@ -733,10 +749,10 @@ def test_on_update_status_after_restore_operation(harness): _update_config.assert_called_once() _handle_processes_failures.assert_called_once() _set_active_status.assert_called_once() - tc.assertIsInstance(harness.charm.unit.status, ActiveStatus) + assert isinstance(harness.charm.unit.status, ActiveStatus) # Assert that the backup id is not in the application relation databag anymore. - tc.assertEqual(harness.get_relation_data(rel_id, harness.charm.app), {}) + assert harness.get_relation_data(rel_id, harness.charm.app) == {} # Test when it's not possible to use the configured S3 repository. 
_update_config.reset_mock() @@ -753,11 +769,11 @@ def test_on_update_status_after_restore_operation(harness): _update_config.assert_called_once() _handle_processes_failures.assert_not_called() _set_active_status.assert_not_called() - tc.assertIsInstance(harness.charm.unit.status, BlockedStatus) - tc.assertEqual(harness.charm.unit.status.message, "fake validation message") + assert isinstance(harness.charm.unit.status, BlockedStatus) + assert harness.charm.unit.status.message == "fake validation message" # Assert that the backup id is not in the application relation databag anymore. - tc.assertEqual(harness.get_relation_data(rel_id, harness.charm.app), {}) + assert harness.get_relation_data(rel_id, harness.charm.app) == {} def test_on_upgrade_charm(harness): @@ -781,7 +797,7 @@ def test_on_upgrade_charm(harness): harness.charm.on.upgrade_charm.emit() _create_services.assert_not_called() _patch_pod_labels.assert_called_once() - tc.assertTrue(isinstance(harness.charm.unit.status, ActiveStatus)) + assert isinstance(harness.charm.unit.status, ActiveStatus) # Test with a problem happening when trying to create the k8s resources. _patch_pod_labels.reset_mock() @@ -789,7 +805,7 @@ def test_on_upgrade_charm(harness): harness.charm.on.upgrade_charm.emit() _create_services.assert_called_once() _patch_pod_labels.assert_not_called() - tc.assertTrue(isinstance(harness.charm.unit.status, BlockedStatus)) + assert isinstance(harness.charm.unit.status, BlockedStatus) # Test a successful k8s resources creation, but unsuccessful pod patch operation. _create_services.reset_mock() @@ -797,7 +813,7 @@ def test_on_upgrade_charm(harness): harness.charm.on.upgrade_charm.emit() _create_services.assert_called_once() _patch_pod_labels.assert_called_once() - tc.assertTrue(isinstance(harness.charm.unit.status, BlockedStatus)) + assert isinstance(harness.charm.unit.status, BlockedStatus) # Test a successful k8s resources creation and the operation to patch the pod. 
_create_services.reset_mock() @@ -806,8 +822,7 @@ def test_on_upgrade_charm(harness): harness.charm.on.upgrade_charm.emit() _create_services.assert_called_once() _patch_pod_labels.assert_called_once() - tc.assertFalse(isinstance(harness.charm.unit.status, BlockedStatus)) - + assert not isinstance(harness.charm.unit.status, BlockedStatus) def test_create_services(harness): with patch("charm.Client") as _client: @@ -932,7 +947,7 @@ def test_postgresql_layer(harness): } }, } - tc.assertDictEqual(plan, expected) + assert plan == expected def test_on_stop(harness): @@ -968,10 +983,8 @@ def test_on_stop(harness): namespace=harness.charm.model.name, labels={"app.juju.is/created-by": harness.charm.app.name}, ) - tc.assertEqual(_client.return_value.apply.call_count, 2) - tc.assertEqual( - harness.get_relation_data(rel_id, harness.charm.unit), relation_data - ) + assert _client.return_value.apply.call_count == 2 + assert harness.get_relation_data(rel_id, harness.charm.unit) == relation_data _client.reset_mock() # Test when the charm fails to get first pod info. @@ -1018,7 +1031,7 @@ def test_on_stop(harness): def test_client_relations(harness): # Test when the charm has no relations. - tc.assertEqual(harness.charm.client_relations, []) + assert harness.charm.client_relations == [] # Test when the charm has some relations. 
harness.add_relation("database", "application") @@ -1027,9 +1040,7 @@ def test_client_relations(harness): database_relation = harness.model.get_relation("database") db_relation = harness.model.get_relation("db") db_admin_relation = harness.model.get_relation("db-admin") - tc.assertEqual( - harness.charm.client_relations, [database_relation, db_relation, db_admin_relation] - ) + assert harness.charm.client_relations == [database_relation, db_relation, db_admin_relation] def test_validate_config_options(harness): @@ -1185,8 +1196,9 @@ def test_set_secret_in_databag(harness, only_without_juju_secrets): harness.charm.set_secret("unit", "password", None) assert "password" not in harness.get_relation_data(rel_id, harness.charm.unit.name) - with tc.assertRaises(RuntimeError): + with pytest.raises(RuntimeError): harness.charm.set_secret("test", "password", "test") + assert False @pytest.mark.parametrize("scope,is_leader", [("app", True), ("unit", True), ("unit", False)]) @@ -1453,12 +1465,9 @@ def test_on_peer_relation_changed(harness): with harness.hooks_disabled(): harness.update_relation_data(rel_id, harness.charm.unit.name, {"start-tls-server": ""}) harness.charm.on.database_peers_relation_changed.emit(relation) - tc.assertEqual( - harness.get_relation_data(rel_id, harness.charm.unit), - {"start-tls-server": "True"}, - ) + assert harness.get_relation_data(rel_id, harness.charm.unit) == {"start-tls-server": "True"} _defer.assert_called_once() - tc.assertIsInstance(harness.charm.unit.status, MaintenanceStatus) + assert isinstance(harness.charm.unit.status, MaintenanceStatus) _set_active_status.assert_not_called() # Test the status being changed when it was possible to start the @@ -1466,10 +1475,7 @@ def test_on_peer_relation_changed(harness): _defer.reset_mock() _start_stop_pgbackrest_service.return_value = True harness.charm.on.database_peers_relation_changed.emit(relation) - tc.assertEqual( - harness.get_relation_data(rel_id, harness.charm.unit), - {}, - ) + assert 
harness.get_relation_data(rel_id, harness.charm.unit) == {} _defer.assert_not_called() _set_active_status.assert_called_once() @@ -1477,7 +1483,7 @@ def test_on_peer_relation_changed(harness): _set_active_status.reset_mock() harness.charm.unit.status = BlockedStatus() harness.charm.on.database_peers_relation_changed.emit(relation) - tc.assertIsInstance(harness.charm.unit.status, BlockedStatus) + assert isinstance(harness.charm.unit.status, BlockedStatus) _set_active_status.assert_not_called() @@ -1510,7 +1516,7 @@ def test_handle_processes_failures(harness): _is_database_running.return_value = values[2] _is_primary.return_value = values[3] _member_streaming.return_value = values[4] - tc.assertFalse(harness.charm._handle_processes_failures()) + assert not harness.charm._handle_processes_failures() _restart.assert_not_called() _reinitialize_postgresql.assert_not_called() @@ -1545,8 +1551,8 @@ def test_handle_processes_failures(harness): _member_streaming.return_value = values[3] harness.charm.unit.status = ActiveStatus() result = harness.charm._handle_processes_failures() - tc.assertTrue(result) if values[0] is None else tc.assertFalse(result) - tc.assertIsInstance(harness.charm.unit.status, ActiveStatus) + assert result == (values[0] is None) + assert isinstance(harness.charm.unit.status, ActiveStatus) _restart.assert_called_once_with("postgresql") _reinitialize_postgresql.assert_not_called() @@ -1568,8 +1574,8 @@ def test_handle_processes_failures(harness): _is_database_running.return_value = values[2] harness.charm.unit.status = ActiveStatus() result = harness.charm._handle_processes_failures() - tc.assertTrue(result) if values[0] is None else tc.assertFalse(result) - tc.assertIsInstance( + assert result == (values[0] is None) + assert isinstance( harness.charm.unit.status, MaintenanceStatus if values[0] is None else ActiveStatus ) _restart.assert_not_called() @@ -1614,8 +1620,9 @@ def test_update_config(harness): # Test when the two config options for profile 
limit memory are set at the same time. _render_patroni_yml_file.reset_mock() harness.update_config({"profile-limit-memory": 1000}) - with tc.assertRaises(ValueError): + with pytest.raises(ValueError): harness.charm.update_config() + assert False # Test without TLS files available. harness.update_config(unset={"profile-limit-memory", "profile_limit_memory"}) @@ -1637,7 +1644,7 @@ def test_update_config(harness): parameters={"test": "test"}, ) _handle_postgresql_restart_need.assert_called_once() - tc.assertNotIn("tls", harness.get_relation_data(rel_id, harness.charm.unit.name)) + assert "tls" not in harness.get_relation_data(rel_id, harness.charm.unit.name) # Test with TLS files available. _handle_postgresql_restart_need.reset_mock() @@ -1661,12 +1668,9 @@ def test_update_config(harness): parameters={"test": "test"}, ) _handle_postgresql_restart_need.assert_called_once() - tc.assertNotIn( - "tls", - harness.get_relation_data( + assert "tls" not in harness.get_relation_data( rel_id, harness.charm.unit.name - ), # The "tls" flag is set in handle_postgresql_restart_need. - ) + ) # The "tls" flag is set in handle_postgresql_restart_need. # Test with workload not running yet. harness.update_relation_data( @@ -1675,9 +1679,7 @@ def test_update_config(harness): _handle_postgresql_restart_need.reset_mock() harness.charm.update_config() _handle_postgresql_restart_need.assert_not_called() - tc.assertEqual( - harness.get_relation_data(rel_id, harness.charm.unit.name)["tls"], "enabled" - ) + assert harness.get_relation_data(rel_id, harness.charm.unit.name)["tls"] == "enabled" # Test with member not started yet. harness.update_relation_data( @@ -1685,7 +1687,7 @@ def test_update_config(harness): ) # Mock some data in the relation to test that it doesn't change.
harness.charm.update_config() _handle_postgresql_restart_need.assert_not_called() - tc.assertNotIn("tls", harness.get_relation_data(rel_id, harness.charm.unit.name)) + assert "tls" not in harness.get_relation_data(rel_id, harness.charm.unit.name) def test_handle_postgresql_restart_need(harness): @@ -1757,12 +1759,12 @@ def test_set_active_status(harness): _is_standby_leader.side_effect = values[1] _is_standby_leader.return_value = None harness.charm._set_active_status() - tc.assertIsInstance(harness.charm.unit.status, MaintenanceStatus) + assert isinstance(harness.charm.unit.status, MaintenanceStatus) else: _is_standby_leader.side_effect = None _is_standby_leader.return_value = values[1] harness.charm._set_active_status() - tc.assertIsInstance( + assert isinstance( harness.charm.unit.status, ActiveStatus if values[0] == harness.charm.unit.name or values[1] or values[2] @@ -1778,7 +1780,7 @@ def test_set_active_status(harness): _get_primary.side_effect = values[0] _get_primary.return_value = None harness.charm._set_active_status() - tc.assertIsInstance(harness.charm.unit.status, MaintenanceStatus) + assert isinstance(harness.charm.unit.status, MaintenanceStatus) def test_create_pgdata(harness): From 691133e716febab7500784fefcadd56337368d5a Mon Sep 17 00:00:00 2001 From: Dragomir Penev Date: Fri, 2 Aug 2024 18:56:06 +0300 Subject: [PATCH 19/19] Lint --- tests/unit/test_charm.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 09ee0a8d7a..9667e3f43f 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -824,6 +824,7 @@ def test_on_upgrade_charm(harness): _patch_pod_labels.assert_called_once() assert not isinstance(harness.charm.unit.status, BlockedStatus) + def test_create_services(harness): with patch("charm.Client") as _client: # Test the successful creation of the resources. 
@@ -1465,7 +1466,9 @@ def test_on_peer_relation_changed(harness): with harness.hooks_disabled(): harness.update_relation_data(rel_id, harness.charm.unit.name, {"start-tls-server": ""}) harness.charm.on.database_peers_relation_changed.emit(relation) - assert harness.get_relation_data(rel_id, harness.charm.unit) == {"start-tls-server": "True"} + assert harness.get_relation_data(rel_id, harness.charm.unit) == { + "start-tls-server": "True" + } _defer.assert_called_once() assert isinstance(harness.charm.unit.status, MaintenanceStatus) _set_active_status.assert_not_called() @@ -1669,8 +1672,8 @@ def test_update_config(harness): ) _handle_postgresql_restart_need.assert_called_once() assert "tls" not in harness.get_relation_data( - rel_id, harness.charm.unit.name - ) # The "tls" flag is set in handle_postgresql_restart_need. + rel_id, harness.charm.unit.name + ) # The "tls" flag is set in handle_postgresql_restart_need. # Test with workload not running yet. harness.update_relation_data(