From 8fb7576ba10a77744ba30e9357f5ef1a43323dac Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Tue, 6 Feb 2024 10:54:58 -0300 Subject: [PATCH 1/8] Fix network cut test Signed-off-by: Marcelo Henrique Neppel --- src/charm.py | 5 +- tests/integration/ha_tests/helpers.py | 98 +++++++++++++++---- .../integration/ha_tests/test_self_healing.py | 7 +- 3 files changed, 84 insertions(+), 26 deletions(-) diff --git a/src/charm.py b/src/charm.py index 4f64355054..14f7e224fc 100755 --- a/src/charm.py +++ b/src/charm.py @@ -515,12 +515,14 @@ def _reconfigure_cluster(self, event: HookEvent): and event.relation.data[event.unit].get("ip-to-remove") is not None ): ip_to_remove = event.relation.data[event.unit].get("ip-to-remove") + logger.info("Removing %s from the cluster", ip_to_remove) try: self._patroni.remove_raft_member(ip_to_remove) except RemoveRaftMemberFailedError: logger.debug("Deferring on_peer_relation_changed: failed to remove raft member") return False - self._remove_from_members_ips(ip_to_remove) + if ip_to_remove in self.members_ips: + self._remove_from_members_ips(ip_to_remove) self._add_members(event) return True @@ -803,6 +805,7 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None: # Remove departing units when the leader changes. for ip in self._get_ips_to_remove(): + logger.info("Removing %s from the cluster", ip) self._remove_from_members_ips(ip) self.update_config() diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index c9844d41fd..f62286b2a1 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -1,5 +1,6 @@ # Copyright 2022 Canonical Ltd. # See LICENSE file for licensing details. 
+import logging import os import random import subprocess @@ -22,6 +23,8 @@ from ..helpers import APPLICATION_NAME, db_connect, get_unit_address, run_command_on_unit +logger = logging.getLogger(__name__) + METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) PORT = 5432 APP_NAME = METADATA["name"] @@ -161,14 +164,18 @@ async def change_wal_settings( ) -async def is_cluster_updated(ops_test: OpsTest, primary_name: str) -> None: +async def is_cluster_updated( + ops_test: OpsTest, primary_name: str, use_ip_from_inside: bool = False +) -> None: # Verify that the old primary is now a replica. + logger.info("checking that the former primary is now a replica") assert await is_replica( - ops_test, primary_name + ops_test, primary_name, use_ip_from_inside ), "there are more than one primary in the cluster." # Verify that all units are part of the same cluster. - member_ips = await fetch_cluster_members(ops_test) + logger.info("checking that all units are part of the same cluster") + member_ips = await fetch_cluster_members(ops_test, use_ip_from_inside) app = primary_name.split("/")[0] ip_addresses = [ await get_unit_ip(ops_test, unit.name) for unit in ops_test.model.applications[app].units @@ -176,18 +183,22 @@ async def is_cluster_updated(ops_test: OpsTest, primary_name: str) -> None: assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster." # Verify that no writes to the database were missed after stopping the writes. - total_expected_writes = await check_writes(ops_test) + logger.info("checking that no writes to the database were missed after stopping the writes") + total_expected_writes = await check_writes(ops_test, use_ip_from_inside) # Verify that old primary is up-to-date. 
+ logger.info("checking that the former primary is up to date with the cluster after restarting") assert await is_secondary_up_to_date( - ops_test, primary_name, total_expected_writes + ops_test, primary_name, total_expected_writes, use_ip_from_inside ), "secondary not up to date with the cluster after restarting." -async def check_writes(ops_test) -> int: +async def check_writes(ops_test, use_ip_from_inside: bool = False) -> int: """Gets the total writes from the test charm and compares to the writes from db.""" total_expected_writes = await stop_continuous_writes(ops_test) - actual_writes, max_number_written = await count_writes(ops_test) + actual_writes, max_number_written = await count_writes( + ops_test, use_ip_from_inside=use_ip_from_inside + ) for member, count in actual_writes.items(): assert ( count == max_number_written[member] @@ -197,21 +208,33 @@ async def check_writes(ops_test) -> int: async def count_writes( - ops_test: OpsTest, down_unit: str = None + ops_test: OpsTest, down_unit: str = None, use_ip_from_inside: bool = False ) -> Tuple[Dict[str, int], Dict[str, int]]: """Count the number of writes in the database.""" app = await app_name(ops_test) password = await get_password(ops_test, app, down_unit) for unit in ops_test.model.applications[app].units: if unit.name != down_unit: - cluster = get_patroni_cluster(await get_unit_ip(ops_test, unit.name)) + cluster = get_patroni_cluster( + await ( + get_ip_from_inside_the_unit(ops_test, unit.name) + if use_ip_from_inside + else get_unit_ip(ops_test, unit.name) + ) + ) break down_ips = [] if down_unit: for unit in ops_test.model.applications[app].units: if unit.name == down_unit: down_ips.append(unit.public_address) - down_ips.append(await get_unit_ip(ops_test, unit.name)) + down_ips.append( + await ( + get_ip_from_inside_the_unit(ops_test, unit.name) + if use_ip_from_inside + else get_unit_ip(ops_test, unit.name) + ) + ) count = {} maximum = {} for member in cluster["members"]: @@ -263,16 +286,21 @@ def 
cut_network_from_unit_without_ip_change(machine_name: str) -> None: subprocess.check_call(limit_set_command.split()) -async def fetch_cluster_members(ops_test: OpsTest): +async def fetch_cluster_members(ops_test: OpsTest, use_ip_from_inside: bool = False): """Fetches the IPs listed by Patroni as cluster members. Args: ops_test: OpsTest instance. + use_ip_from_inside: whether to use the IP from inside the unit. """ app = await app_name(ops_test) member_ips = {} for unit in ops_test.model.applications[app].units: - unit_ip = await get_unit_ip(ops_test, unit.name) + unit_ip = await ( + get_ip_from_inside_the_unit(ops_test, unit.name) + if use_ip_from_inside + else get_unit_ip(ops_test, unit.name) + ) cluster_info = requests.get(f"http://{unit_ip}:8008/cluster") if len(member_ips) > 0: # If the list of members IPs was already fetched, also compare the @@ -304,6 +332,16 @@ async def get_controller_machine(ops_test: OpsTest) -> str: ][0] +async def get_ip_from_inside_the_unit(ops_test: OpsTest, unit_name: str) -> str: + command = f"exec --unit {unit_name} -- hostname -I" + return_code, stdout, stderr = await ops_test.juju(*command.split()) + if return_code != 0: + raise ProcessError( + "Expected command %s to succeed instead it failed: %s %s", command, return_code, stderr + ) + return stdout.splitlines()[0].strip() + + async def get_patroni_setting(ops_test: OpsTest, setting: str) -> Optional[int]: """Get the value of one of the integer Patroni settings. 
@@ -388,11 +426,17 @@ async def get_unit_ip(ops_test: OpsTest, unit_name: str) -> str: @retry(stop=stop_after_attempt(8), wait=wait_fixed(15), reraise=True) -async def is_connection_possible(ops_test: OpsTest, unit_name: str) -> bool: +async def is_connection_possible( + ops_test: OpsTest, unit_name: str, use_ip_from_inside: bool = False +) -> bool: """Test a connection to a PostgreSQL server.""" app = unit_name.split("/")[0] password = await get_password(ops_test, app, unit_name) - address = await get_unit_ip(ops_test, unit_name) + address = await ( + get_ip_from_inside_the_unit(ops_test, unit_name) + if use_ip_from_inside + else get_unit_ip(ops_test, unit_name) + ) try: with db_connect( host=address, password=password @@ -420,9 +464,13 @@ def is_machine_reachable_from(origin_machine: str, target_machine: str) -> bool: return False -async def is_replica(ops_test: OpsTest, unit_name: str) -> bool: +async def is_replica(ops_test: OpsTest, unit_name: str, use_ip_from_inside: bool = False) -> bool: """Returns whether the unit a replica in the cluster.""" - unit_ip = await get_unit_ip(ops_test, unit_name) + unit_ip = await ( + get_ip_from_inside_the_unit(ops_test, unit_name) + if use_ip_from_inside + else get_unit_ip(ops_test, unit_name) + ) member_name = unit_name.replace("/", "-") try: @@ -571,7 +619,9 @@ def restore_network_for_unit_without_ip_change(machine_name: str) -> None: subprocess.check_call(limit_set_command.split()) -async def is_secondary_up_to_date(ops_test: OpsTest, unit_name: str, expected_writes: int) -> bool: +async def is_secondary_up_to_date( + ops_test: OpsTest, unit_name: str, expected_writes: int, use_ip_from_inside: bool = False +) -> bool: """Checks if secondary is up-to-date with the cluster. Retries over the period of one minute to give secondary adequate time to copy over data. 
@@ -579,7 +629,11 @@ async def is_secondary_up_to_date(ops_test: OpsTest, unit_name: str, expected_wr app = await app_name(ops_test) password = await get_password(ops_test, app) host = [ - await get_unit_ip(ops_test, unit.name) + await ( + get_ip_from_inside_the_unit(ops_test, unit.name) + if use_ip_from_inside + else get_unit_ip(ops_test, unit.name) + ) for unit in ops_test.model.applications[app].units if unit.name == unit_name ][0] @@ -679,15 +733,17 @@ async def update_restart_condition(ops_test: OpsTest, unit, condition: str): @retry(stop=stop_after_attempt(20), wait=wait_fixed(30)) -async def wait_network_restore(ops_test: OpsTest, hostname: str, old_ip: str) -> None: +async def wait_network_restore(ops_test: OpsTest, unit_name: str, old_ip: str) -> None: """Wait until network is restored. Args: ops_test: pytest plugin helper - hostname: The name of the instance + unit_name: name of the unit old_ip: old registered IP address """ - if await instance_ip(ops_test, hostname) == old_ip: + # Retrieve the unit IP from inside the unit because it may not be updated in the + # Juju status quickly enough. + if (await get_ip_from_inside_the_unit(ops_test, unit_name)) == old_ip: raise Exception diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index 83d2166914..0de2a2b79b 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -383,7 +383,6 @@ async def test_forceful_restart_without_data_and_transaction_logs( @pytest.mark.group(1) -@pytest.mark.unstable async def test_network_cut(ops_test: OpsTest, continuous_writes, primary_start_timeout): """Completely cut and restore network.""" # Locate primary unit. @@ -456,15 +455,15 @@ async def test_network_cut(ops_test: OpsTest, continuous_writes, primary_start_t # Wait the LXD unit has its IP updated. 
logger.info("waiting for IP address to be updated on Juju unit") - await wait_network_restore(ops_test, primary_hostname, primary_ip) + await wait_network_restore(ops_test, primary_name, primary_ip) # Verify that connection is possible. logger.info("checking whether the connectivity to the database is working") assert await is_connection_possible( - ops_test, primary_name + ops_test, primary_name, use_ip_from_inside=True ), "Connection is not possible after network restore" - await is_cluster_updated(ops_test, primary_name) + await is_cluster_updated(ops_test, primary_name, use_ip_from_inside=True) @pytest.mark.group(1) From c8cb4b24bb4e38e76f91f5ce39c6fd4ddf8912c8 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Tue, 6 Feb 2024 14:23:33 -0300 Subject: [PATCH 2/8] Fix network cut test without IP change Signed-off-by: Marcelo Henrique Neppel --- src/charm.py | 2 +- tests/integration/ha_tests/helpers.py | 20 +++++++++---------- .../integration/ha_tests/test_self_healing.py | 5 ++--- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/charm.py b/src/charm.py index 14f7e224fc..cd1e347e23 100755 --- a/src/charm.py +++ b/src/charm.py @@ -515,7 +515,7 @@ def _reconfigure_cluster(self, event: HookEvent): and event.relation.data[event.unit].get("ip-to-remove") is not None ): ip_to_remove = event.relation.data[event.unit].get("ip-to-remove") - logger.info("Removing %s from the cluster", ip_to_remove) + logger.info("Removing %s from the cluster due to IP change", ip_to_remove) try: self._patroni.remove_raft_member(ip_to_remove) except RemoveRaftMemberFailedError: diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index f62286b2a1..2daf3a6497 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -77,13 +77,19 @@ async def are_all_db_processes_down(ops_test: OpsTest, process: str) -> bool: return True -async def are_writes_increasing(ops_test, down_unit: str = None) 
-> None: +async def are_writes_increasing( + ops_test, down_unit: str = None, use_ip_from_inside: bool = False +) -> None: """Verify new writes are continuing by counting the number of writes.""" - writes, _ = await count_writes(ops_test, down_unit=down_unit) + writes, _ = await count_writes( + ops_test, down_unit=down_unit, use_ip_from_inside=use_ip_from_inside + ) for member, count in writes.items(): for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)): with attempt: - more_writes, _ = await count_writes(ops_test, down_unit=down_unit) + more_writes, _ = await count_writes( + ops_test, down_unit=down_unit, use_ip_from_inside=use_ip_from_inside + ) assert more_writes[member] > count, f"{member}: writes not continuing to DB" @@ -228,13 +234,7 @@ async def count_writes( for unit in ops_test.model.applications[app].units: if unit.name == down_unit: down_ips.append(unit.public_address) - down_ips.append( - await ( - get_ip_from_inside_the_unit(ops_test, unit.name) - if use_ip_from_inside - else get_unit_ip(ops_test, unit.name) - ) - ) + down_ips.append(await get_unit_ip(ops_test, unit.name)) count = {} maximum = {} for member in cluster["members"]: diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index 0de2a2b79b..2a599098c3 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -467,7 +467,6 @@ async def test_network_cut(ops_test: OpsTest, continuous_writes, primary_start_t @pytest.mark.group(1) -@pytest.mark.unstable async def test_network_cut_without_ip_change( ops_test: OpsTest, continuous_writes, primary_start_timeout ): @@ -515,7 +514,7 @@ async def test_network_cut_without_ip_change( async with ops_test.fast_forward(): logger.info("checking whether writes are increasing") - await are_writes_increasing(ops_test, primary_name) + await are_writes_increasing(ops_test, primary_name, use_ip_from_inside=True) 
logger.info("checking whether a new primary was elected") # Verify that a new primary gets elected (ie old primary is secondary). @@ -538,4 +537,4 @@ async def test_network_cut_without_ip_change( ops_test, primary_name ), "Connection is not possible after network restore" - await is_cluster_updated(ops_test, primary_name) + await is_cluster_updated(ops_test, primary_name, use_ip_from_inside=True) From df5c5b90bca2daab877a67e63df9e23c4c90a5f1 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Tue, 6 Feb 2024 15:49:45 -0300 Subject: [PATCH 3/8] Update unit test Signed-off-by: Marcelo Henrique Neppel --- tests/unit/test_charm.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 09bfa6f445..4d0d758b82 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -1375,20 +1375,36 @@ def test_reconfigure_cluster( # Test when a change is needed in the member IP, but it fails. _remove_raft_member.side_effect = RemoveRaftMemberFailedError _add_members.reset_mock() - mock_event.relation.data = {mock_event.unit: {"ip-to-remove": "1.1.1.1"}} + ip_to_remove = "1.1.1.1" + relation_data = {mock_event.unit: {"ip-to-remove": ip_to_remove}} + mock_event.relation.data = relation_data self.assertFalse(self.charm._reconfigure_cluster(mock_event)) - _remove_raft_member.assert_called_once() + _remove_raft_member.assert_called_once_with(ip_to_remove) _remove_from_members_ips.assert_not_called() _add_members.assert_not_called() - # Test when a change is needed in the member IP and it succeeds. + # Test when a change is needed in the member IP, and it succeeds + # (but the old IP had already been removed). 
_remove_raft_member.reset_mock() _remove_raft_member.side_effect = None _add_members.reset_mock() - mock_event.relation.data = {mock_event.unit: {"ip-to-remove": "1.1.1.1"}} + mock_event.relation.data = relation_data self.assertTrue(self.charm._reconfigure_cluster(mock_event)) - _remove_raft_member.assert_called_once() - _remove_from_members_ips.assert_called_once() + _remove_raft_member.assert_called_once_with(ip_to_remove) + _remove_from_members_ips.assert_not_called() + _add_members.assert_called_once_with(mock_event) + + # Test when the old IP wasn't removed yet. + _remove_raft_member.reset_mock() + _add_members.reset_mock() + mock_event.relation.data = relation_data + with self.harness.hooks_disabled(): + self.harness.update_relation_data( + self.rel_id, self.charm.app.name, {"members_ips": '["' + ip_to_remove + '"]'} + ) + self.assertTrue(self.charm._reconfigure_cluster(mock_event)) + _remove_raft_member.assert_called_once_with(ip_to_remove) + _remove_from_members_ips.assert_called_once_with(ip_to_remove) _add_members.assert_called_once_with(mock_event) @patch("charms.postgresql_k8s.v0.postgresql_tls.PostgreSQLTLS._request_certificate") From 33825ef161ee2779388f774e1b51ea59b9fa1d19 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Wed, 7 Feb 2024 09:16:49 -0300 Subject: [PATCH 4/8] Fix retrieval of units IPs Signed-off-by: Marcelo Henrique Neppel --- tests/integration/ha_tests/helpers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index 2daf3a6497..0cee5e3b30 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -184,7 +184,10 @@ async def is_cluster_updated( member_ips = await fetch_cluster_members(ops_test, use_ip_from_inside) app = primary_name.split("/")[0] ip_addresses = [ - await get_unit_ip(ops_test, unit.name) for unit in ops_test.model.applications[app].units + await 
get_ip_from_inside_the_unit(ops_test, unit.name) + if use_ip_from_inside + else get_unit_ip(ops_test, unit.name) + for unit in ops_test.model.applications[app].units ] assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster." From 8a70e27db0d4c293492b458806b6656e1a8d3739 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Wed, 7 Feb 2024 09:49:21 -0300 Subject: [PATCH 5/8] Improve checks for readiness Signed-off-by: Marcelo Henrique Neppel --- tests/integration/ha_tests/helpers.py | 18 ++++++++++-------- .../integration/ha_tests/test_self_healing.py | 8 ++++++++ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index 0cee5e3b30..248bc1cd3f 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -441,14 +441,16 @@ async def is_connection_possible( else get_unit_ip(ops_test, unit_name) ) try: - with db_connect( - host=address, password=password - ) as connection, connection.cursor() as cursor: - cursor.execute("SELECT 1;") - success = cursor.fetchone()[0] == 1 - connection.close() - return success - except psycopg2.Error: + for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): + with attempt: + with db_connect( + host=address, password=password + ) as connection, connection.cursor() as cursor: + cursor.execute("SELECT 1;") + success = cursor.fetchone()[0] == 1 + connection.close() + return success + except (psycopg2.Error, RetryError): # Error raised when the connection is not possible. 
return False diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index 2a599098c3..675aea7033 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -457,6 +457,10 @@ async def test_network_cut(ops_test: OpsTest, continuous_writes, primary_start_t logger.info("waiting for IP address to be updated on Juju unit") await wait_network_restore(ops_test, primary_name, primary_ip) + # Verify that the database service got restarted and is ready in the old primary. + logger.info(f"waiting for the database service to restart on {primary_name}") + assert await is_postgresql_ready(ops_test, primary_name) + # Verify that connection is possible. logger.info("checking whether the connectivity to the database is working") assert await is_connection_possible( @@ -531,6 +535,10 @@ async def test_network_cut_without_ip_change( async with ops_test.fast_forward(): await ops_test.model.wait_for_idle(apps=[app], status="active") + # Verify that the database service got restarted and is ready in the old primary. + logger.info(f"waiting for the database service to restart on {primary_name}") + assert await is_postgresql_ready(ops_test, primary_name) + # Verify that connection is possible. 
logger.info("checking whether the connectivity to the database is working") assert await is_connection_possible( From 7be78b26ca267414eec28b1154c36d2325d300e9 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Wed, 7 Feb 2024 13:42:06 -0300 Subject: [PATCH 6/8] Fix IP retrieval Signed-off-by: Marcelo Henrique Neppel --- tests/integration/ha_tests/helpers.py | 8 ++++++-- tests/integration/ha_tests/test_self_healing.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index 248bc1cd3f..85752e767c 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -585,9 +585,13 @@ async def send_signal_to_process( ) -async def is_postgresql_ready(ops_test, unit_name: str) -> bool: +async def is_postgresql_ready(ops_test, unit_name: str, use_ip_from_inside: bool = False) -> bool: """Verifies a PostgreSQL instance is running and available.""" - unit_ip = get_unit_address(ops_test, unit_name) + unit_ip = ( + get_ip_from_inside_the_unit(ops_test, unit_name) + if use_ip_from_inside + else get_unit_address(ops_test, unit_name) + ) try: for attempt in Retrying(stop=stop_after_delay(60 * 5), wait=wait_fixed(3)): with attempt: diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index 675aea7033..8ef002fa09 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -459,7 +459,7 @@ async def test_network_cut(ops_test: OpsTest, continuous_writes, primary_start_t # Verify that the database service got restarted and is ready in the old primary. logger.info(f"waiting for the database service to restart on {primary_name}") - assert await is_postgresql_ready(ops_test, primary_name) + assert await is_postgresql_ready(ops_test, primary_name, use_ip_from_inside=True) # Verify that connection is possible. 
logger.info("checking whether the connectivity to the database is working") From e37b856293c201bcdf7ff2342c41d921fa6a8910 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Wed, 7 Feb 2024 15:25:26 -0300 Subject: [PATCH 7/8] Fix IP retrieval Signed-off-by: Marcelo Henrique Neppel --- tests/integration/ha_tests/helpers.py | 2 +- tests/integration/ha_tests/test_self_healing.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index 85752e767c..23edb34f24 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -588,7 +588,7 @@ async def send_signal_to_process( async def is_postgresql_ready(ops_test, unit_name: str, use_ip_from_inside: bool = False) -> bool: """Verifies a PostgreSQL instance is running and available.""" unit_ip = ( - get_ip_from_inside_the_unit(ops_test, unit_name) + (await get_ip_from_inside_the_unit(ops_test, unit_name)) if use_ip_from_inside else get_unit_address(ops_test, unit_name) ) diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index 8ef002fa09..d73ffa2b21 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -458,7 +458,7 @@ async def test_network_cut(ops_test: OpsTest, continuous_writes, primary_start_t await wait_network_restore(ops_test, primary_name, primary_ip) # Verify that the database service got restarted and is ready in the old primary. - logger.info(f"waiting for the database service to restart on {primary_name}") + logger.info(f"waiting for the database service to be ready on {primary_name}") assert await is_postgresql_ready(ops_test, primary_name, use_ip_from_inside=True) # Verify that connection is possible. 
@@ -536,7 +536,7 @@ async def test_network_cut_without_ip_change( await ops_test.model.wait_for_idle(apps=[app], status="active") # Verify that the database service got restarted and is ready in the old primary. - logger.info(f"waiting for the database service to restart on {primary_name}") + logger.info(f"waiting for the database service to be ready on {primary_name}") assert await is_postgresql_ready(ops_test, primary_name) # Verify that connection is possible. From 53a422aae3cf338843aeb3617c47b9cfc942d043 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Wed, 7 Feb 2024 18:49:09 -0300 Subject: [PATCH 8/8] Fix IP retrieval Signed-off-by: Marcelo Henrique Neppel --- poetry.lock | 1 - tests/integration/ha_tests/helpers.py | 8 +++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index a16d15edb9..32d8e61f7d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1633,7 +1633,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, diff --git 
a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index 23edb34f24..2e35b85f94 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -184,9 +184,11 @@ async def is_cluster_updated( member_ips = await fetch_cluster_members(ops_test, use_ip_from_inside) app = primary_name.split("/")[0] ip_addresses = [ - await get_ip_from_inside_the_unit(ops_test, unit.name) - if use_ip_from_inside - else get_unit_ip(ops_test, unit.name) + await ( + get_ip_from_inside_the_unit(ops_test, unit.name) + if use_ip_from_inside + else get_unit_ip(ops_test, unit.name) + ) for unit in ops_test.model.applications[app].units ] assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster."