From 04223a849727f118d9ea077bbb1b32441c798e58 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Mon, 17 Oct 2022 20:25:04 -0300 Subject: [PATCH 01/28] Add SST test --- tests/integration/ha_tests/clean-data-dir.sh | 4 + .../integration/ha_tests/test_self_healing.py | 108 +++++++++++++++++- 2 files changed, 110 insertions(+), 2 deletions(-) create mode 100755 tests/integration/ha_tests/clean-data-dir.sh diff --git a/tests/integration/ha_tests/clean-data-dir.sh b/tests/integration/ha_tests/clean-data-dir.sh new file mode 100755 index 0000000000..bd020af8a0 --- /dev/null +++ b/tests/integration/ha_tests/clean-data-dir.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +set -Eeuo pipefail +rm -rf /var/lib/postgresql/data/pgdata/* \ No newline at end of file diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index c817deb113..9e0fa4e779 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -21,6 +21,7 @@ start_continuous_writes, stop_continuous_writes, ) +from tests.integration.helpers import get_unit_address APP_NAME = METADATA["name"] PATRONI_PROCESS = "/usr/local/bin/patroni" @@ -44,7 +45,7 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: await ops_test.model.wait_for_idle(status="active", timeout=1000) -@pytest.mark.ha_self_healing_tests +# @pytest.mark.ha_self_healing_tests @pytest.mark.parametrize("process", DB_PROCESSES) async def test_kill_db_process( ops_test: OpsTest, process: str, continuous_writes, master_start_timeout @@ -104,7 +105,7 @@ async def test_kill_db_process( ), "secondary not up to date with the cluster after restarting." -@pytest.mark.ha_self_healing_tests +# @pytest.mark.ha_self_healing_tests @pytest.mark.parametrize("process", DB_PROCESSES) async def test_freeze_db_process( ops_test: OpsTest, process: str, continuous_writes, master_start_timeout @@ -168,3 +169,106 @@ async def test_freeze_db_process( assert await secondary_up_to_date( ops_test, primary_name, total_expected_writes ), "secondary not up to date with the cluster after restarting." + + +@pytest.mark.ha_self_healing_tests +@pytest.mark.parametrize("process", DB_PROCESSES) +async def test_sst(ops_test: OpsTest, process: str, continuous_writes) -> None: + """The SST test. + + A forceful restart instance with deleted data and without transaction logs (forced clone). + """ + app = await app_name(ops_test) + primary_name = await get_primary(ops_test, app) + + # Start an application that continuously writes data to the database. + await start_continuous_writes(ops_test, app) + + # Change the "master_start_timeout" parameter to speed up the fail-over. 
+ original_master_start_timeout = await get_master_start_timeout(ops_test) + await change_master_start_timeout(ops_test, 0) + + # copy data dir content removal script + await ops_test.juju( + "scp", "tests/integration/ha_tests/clean-data-dir.sh", f"{primary_name}:/tmp" + ) + + await send_signal_to_process(ops_test, primary_name, process, "SIGTERM") + + # data removal run within a script + # so it allow `*` expansion + try: + return_code, stdout, stderr = await ops_test.juju( + "ssh", + primary_name, + "sudo", + "/tmp/clean-data-dir.sh", + ) + print(f"return code: {return_code}") + print(f"stdout: {stdout}") + print(f"stderr: {stderr}") + except Exception as e: + print(str(e)) + print(str(type(e))) + + assert return_code == 0, "❌ Failed to remove data directory" + + # async with ops_test.fast_forward(): + # # Wait for unit switch to maintenance status + # await ops_test.model.block_until( + # lambda: primary.workload_status == "maintenance", + # timeout=5 * 60, + # ) + # + # # Wait for unit switch back to active status, this is where self-healing happens + # await ops_test.model.block_until( + # lambda: primary.workload_status == "active", + # timeout=5 * 60, + # ) + + async with ops_test.fast_forward(): + # Verify new writes are continuing by counting the number of writes before and after a + # 3 minutes wait (this is a little more than the loop wait configuration, that is + # considered to trigger a fail-over after master_start_timeout is changed, and also + # when freezing the DB process it take some more time to trigger the fail-over). + writes = await count_writes(ops_test, primary_name) + for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)): + with attempt: + more_writes = await count_writes(ops_test, primary_name) + assert more_writes > writes, "writes not continuing to DB" + + # Verify that a new primary gets elected (ie old primary is secondary). + for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)): + with attempt: + new_primary_name = await get_primary(ops_test, app) + assert new_primary_name != primary_name + + # Revert the "master_start_timeout" parameter to avoid fail-over again. + await change_master_start_timeout(ops_test, original_master_start_timeout) + + # Verify that the database service got restarted and is ready in the old primary. + assert await postgresql_ready(ops_test, primary_name) + + # Verify that the old primary is now a replica. + assert is_replica(ops_test, primary_name), "there are more than one primary in the cluster." + + # Verify that all units are part of the same cluster. + member_ips = await fetch_cluster_members(ops_test) + ip_addresses = [unit.public_address for unit in ops_test.model.applications[app].units] + assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster." + + # Verify that no writes to the database were missed after stopping the writes. + total_expected_writes = await stop_continuous_writes(ops_test) + for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): + with attempt: + actual_writes = await count_writes(ops_test) + assert total_expected_writes == actual_writes, "writes to the db were missed." + + # Verify that old primary is up-to-date. + assert await secondary_up_to_date( + ops_test, primary_name, total_expected_writes + ), "secondary not up to date with the cluster after restarting." 
+ + # verify instance is part of the cluster + cluster_members = await fetch_cluster_members(ops_test) + assert get_unit_address(ops_test, primary_name) in cluster_members From ebc48141abf5eccab7a69b80e08d46d06fe5b74b Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Mon, 17 Oct 2022 20:50:09 -0300 Subject: [PATCH 02/28] Enable previous tests --- tests/integration/ha_tests/test_self_healing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index 9e0fa4e779..0fe0e330df 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -45,7 +45,7 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: await ops_test.model.wait_for_idle(status="active", timeout=1000) -# @pytest.mark.ha_self_healing_tests +@pytest.mark.ha_self_healing_tests @pytest.mark.parametrize("process", DB_PROCESSES) async def test_kill_db_process( ops_test: OpsTest, process: str, continuous_writes, master_start_timeout @@ -105,7 +105,7 @@ async def test_kill_db_process( ), "secondary not up to date with the cluster after restarting." -# @pytest.mark.ha_self_healing_tests +@pytest.mark.ha_self_healing_tests @pytest.mark.parametrize("process", DB_PROCESSES) async def test_freeze_db_process( ops_test: OpsTest, process: str, continuous_writes, master_start_timeout From 84fca0c6d6658f8764ee817b4a7762563c74788f Mon Sep 17 00:00:00 2001 From: WRFitch Date: Tue, 18 Oct 2022 12:48:24 +0100 Subject: [PATCH 03/28] fix early tls deployment by only reloading patroni config if it's already running --- src/cluster.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cluster.py b/src/cluster.py index 5c2a82543a..0acbd0e8c7 100644 --- a/src/cluster.py +++ b/src/cluster.py @@ -380,7 +380,8 @@ def remove_raft_member(self, member_ip: str) -> None: @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10)) def reload_patroni_configuration(self): """Reload Patroni configuration after it was changed.""" - requests.post(f"{self._patroni_url}/reload", verify=self.verify) + if service_running(PATRONI_SERVICE): + requests.post(f"{self._patroni_url}/reload", verify=self.verify) def restart_patroni(self) -> bool: """Restart Patroni. 
From d5d0ad85953bb9b2512bde63098a7c222f049319 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Tue, 18 Oct 2022 09:42:35 -0300 Subject: [PATCH 04/28] Improve code --- tests/integration/ha_tests/clean-data-dir.sh | 2 +- .../integration/ha_tests/test_self_healing.py | 51 ++++++------------- 2 files changed, 17 insertions(+), 36 deletions(-) diff --git a/tests/integration/ha_tests/clean-data-dir.sh b/tests/integration/ha_tests/clean-data-dir.sh index bd020af8a0..c5c714405a 100755 --- a/tests/integration/ha_tests/clean-data-dir.sh +++ b/tests/integration/ha_tests/clean-data-dir.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash set -Eeuo pipefail -rm -rf /var/lib/postgresql/data/pgdata/* \ No newline at end of file +rm -rf /var/lib/postgresql/data/pgdata/* diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index 0fe0e330df..704741ee76 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -173,7 +173,9 @@ async def test_freeze_db_process( @pytest.mark.ha_self_healing_tests @pytest.mark.parametrize("process", DB_PROCESSES) -async def test_sst(ops_test: OpsTest, process: str, continuous_writes) -> None: +async def test_sst( + ops_test: OpsTest, process: str, continuous_writes, master_start_timeout +) -> None: """The SST test. A forceful restart instance with deleted data and without transaction logs (forced clone). @@ -188,43 +190,22 @@ async def test_sst(ops_test: OpsTest, process: str, continuous_writes) -> None: original_master_start_timeout = await get_master_start_timeout(ops_test) await change_master_start_timeout(ops_test, 0) - # copy data dir content removal script + # Copy data dir content removal script. await ops_test.juju( "scp", "tests/integration/ha_tests/clean-data-dir.sh", f"{primary_name}:/tmp" ) - await send_signal_to_process(ops_test, primary_name, process, "SIGTERM") - - # data removal run within a script - # so it allow `*` expansion - try: - return_code, stdout, stderr = await ops_test.juju( - "ssh", - primary_name, - "sudo", - "/tmp/clean-data-dir.sh", - ) - print(f"return code: {return_code}") - print(f"stdout: {stdout}") - print(f"stderr: {stderr}") - except Exception as e: - print(str(e)) - print(str(type(e))) - - assert return_code == 0, "❌ Failed to remove data directory" - - # async with ops_test.fast_forward(): - # # Wait for unit switch to maintenance status - # await ops_test.model.block_until( - # lambda: primary.workload_status == "maintenance", - # timeout=5 * 60, - # ) - # - # # Wait for unit switch back to active status, this is where self-healing happens - # await ops_test.model.block_until( - # lambda: primary.workload_status == "active", - # timeout=5 * 60, - # ) + # Force a restart of the database process. + await send_signal_to_process(ops_test, primary_name, process, "SIGKILL") + + # Data removal runs within a script, so it allows `*` expansion. + return_code, _, _ = await ops_test.juju( + "ssh", + primary_name, + "sudo", + "/tmp/clean-data-dir.sh", + ) + assert return_code == 0, "Failed to remove data directory" async with ops_test.fast_forward(): # Verify new writes are continuing by counting the number of writes before and after a @@ -269,6 +250,6 @@ async def test_sst(ops_test: OpsTest, process: str, continuous_writes) -> None: ops_test, primary_name, total_expected_writes ), "secondary not up to date with the cluster after restarting." 
- # verify instance is part of the cluster + # Verify instance is part of the cluster cluster_members = await fetch_cluster_members(ops_test) assert get_unit_address(ops_test, primary_name) in cluster_members From 1e64d31c1d6a65822671c9569271c4b0cd3c18c0 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Tue, 18 Oct 2022 09:43:57 -0300 Subject: [PATCH 05/28] Remove duplicate check --- tests/integration/ha_tests/test_self_healing.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index 704741ee76..ac43a6f05c 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -249,7 +249,3 @@ async def test_sst( assert await secondary_up_to_date( ops_test, primary_name, total_expected_writes ), "secondary not up to date with the cluster after restarting." - - # Verify instance is part of the cluster - cluster_members = await fetch_cluster_members(ops_test) - assert get_unit_address(ops_test, primary_name) in cluster_members From 4688a95d9eca07e2efa3bb247ed06c02fa436a58 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Tue, 18 Oct 2022 09:46:09 -0300 Subject: [PATCH 06/28] Remove unused import --- tests/integration/ha_tests/test_self_healing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index ac43a6f05c..48009055ae 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -21,7 +21,6 @@ start_continuous_writes, stop_continuous_writes, ) -from tests.integration.helpers import get_unit_address APP_NAME = METADATA["name"] PATRONI_PROCESS = "/usr/local/bin/patroni" From 2b76d2ee5537080fc3da1e41cb7aa08c8ff1a78a Mon Sep 17 00:00:00 2001 From: WRFitch Date: Tue, 18 Oct 2022 13:52:49 +0100 Subject: [PATCH 07/28] added unit test for reloading patroni --- src/cluster.py | 3 ++- tests/unit/test_cluster.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/cluster.py b/src/cluster.py index 0acbd0e8c7..25b500102c 100644 --- a/src/cluster.py +++ b/src/cluster.py @@ -395,4 +395,5 @@ def restart_patroni(self) -> bool: @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10)) def restart_postgresql(self) -> None: """Restart PostgreSQL.""" - requests.post(f"{self._patroni_url}/restart", verify=self.verify) + if service_running(PATRONI_SERVICE): + requests.post(f"{self._patroni_url}/restart", verify=self.verify) diff --git a/tests/unit/test_cluster.py b/tests/unit/test_cluster.py index 19310ae3a4..1617a308d7 100644 --- a/tests/unit/test_cluster.py +++ b/tests/unit/test_cluster.py @@ -276,3 +276,15 @@ def test_start_patroni(self, _create_directory, _service_running, _service_start # Test a fail scenario. 
success = self.patroni.start_patroni() assert not success + + + @patch("cluster.service_running") + @patch("requests.post") + def test_reload_patroni_configuration(self, _post, _service_running): + _service_running.side_effect = [False, True] + + self.patroni.reload_patroni_configuration() + _post.assert_not_called() + + self.patroni.reload_patroni_configuration() + _post.assert_called() \ No newline at end of file From edcfb0c91b4abab64014ae30421aeaae7b759e8d Mon Sep 17 00:00:00 2001 From: WRFitch Date: Tue, 18 Oct 2022 13:54:05 +0100 Subject: [PATCH 08/28] lint --- tests/unit/test_cluster.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/unit/test_cluster.py b/tests/unit/test_cluster.py index 1617a308d7..b665fd7e89 100644 --- a/tests/unit/test_cluster.py +++ b/tests/unit/test_cluster.py @@ -277,7 +277,6 @@ def test_start_patroni(self, _create_directory, _service_running, _service_start success = self.patroni.start_patroni() assert not success - @patch("cluster.service_running") @patch("requests.post") def test_reload_patroni_configuration(self, _post, _service_running): @@ -287,4 +286,4 @@ def test_reload_patroni_configuration(self, _post, _service_running): _post.assert_not_called() self.patroni.reload_patroni_configuration() - _post.assert_called() \ No newline at end of file + _post.assert_called() From 51cee095e90441a84dfd4c6d0246427a18fbb41d Mon Sep 17 00:00:00 2001 From: WRFitch Date: Tue, 18 Oct 2022 14:00:12 +0100 Subject: [PATCH 09/28] removing postgres restart check --- src/cluster.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/cluster.py b/src/cluster.py index 25b500102c..0acbd0e8c7 100644 --- a/src/cluster.py +++ b/src/cluster.py @@ -395,5 +395,4 @@ def restart_patroni(self) -> bool: @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10)) def restart_postgresql(self) -> None: """Restart PostgreSQL.""" - if service_running(PATRONI_SERVICE): - requests.post(f"{self._patroni_url}/restart", verify=self.verify) + requests.post(f"{self._patroni_url}/restart", verify=self.verify) From f05a540f91fc9bbaa159bf5180414e8828482a0d Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Tue, 18 Oct 2022 10:09:11 -0300 Subject: [PATCH 10/28] Pin Juju agent version on CI --- .github/workflows/ci.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 9b8705d8b1..0a37b1a80b 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -57,6 +57,8 @@ jobs: uses: charmed-kubernetes/actions-operator@main with: provider: lxd + # This is needed until https://bugs.launchpad.net/juju/+bug/1992833 is fixed. + bootstrap-options: "--agent-version 2.9.34" - name: Run integration tests run: tox -e database-relation-integration @@ -105,6 +107,8 @@ jobs: uses: charmed-kubernetes/actions-operator@main with: provider: lxd + # This is needed until https://bugs.launchpad.net/juju/+bug/1992833 is fixed. 
+ bootstrap-options: "--agent-version 2.9.34" - name: Run integration tests run: tox -e ha-self-healing-integration From 5a97730e7539703e5aed45e4ee254444089d3f49 Mon Sep 17 00:00:00 2001 From: WRFitch Date: Tue, 18 Oct 2022 14:38:23 +0100 Subject: [PATCH 11/28] adding series flags to test apps --- tests/integration/ha_tests/application-charm/metadata.yaml | 1 + tests/integration/new_relations/application-charm/metadata.yaml | 1 + tests/integration/new_relations/test_new_relations.py | 2 ++ tests/integration/test_db_admin.py | 1 + 4 files changed, 5 insertions(+) diff --git a/tests/integration/ha_tests/application-charm/metadata.yaml b/tests/integration/ha_tests/application-charm/metadata.yaml index d9ba6ef6f3..b8f8c7f8fa 100644 --- a/tests/integration/ha_tests/application-charm/metadata.yaml +++ b/tests/integration/ha_tests/application-charm/metadata.yaml @@ -6,6 +6,7 @@ description: | summary: | Data platform libs application meant to be used only for testing high availability of the PostgreSQL charm. +series: focal requires: database: diff --git a/tests/integration/new_relations/application-charm/metadata.yaml b/tests/integration/new_relations/application-charm/metadata.yaml index 4ada1e1e14..df1c79c3c4 100644 --- a/tests/integration/new_relations/application-charm/metadata.yaml +++ b/tests/integration/new_relations/application-charm/metadata.yaml @@ -6,6 +6,7 @@ description: | summary: | Data platform libs application meant to be used only for testing of the libs in this repository. +series: focal requires: first-database: interface: postgresql_client diff --git a/tests/integration/new_relations/test_new_relations.py b/tests/integration/new_relations/test_new_relations.py index 93a9b090ff..dea16de2ab 100644 --- a/tests/integration/new_relations/test_new_relations.py +++ b/tests/integration/new_relations/test_new_relations.py @@ -48,6 +48,7 @@ async def test_deploy_charms(ops_test: OpsTest, application_charm, database_char resources={"patroni": "patroni.tar.gz"}, application_name=DATABASE_APP_NAME, num_units=1, + series="focal", trust=True, ), ops_test.model.deploy( @@ -55,6 +56,7 @@ async def test_deploy_charms(ops_test: OpsTest, application_charm, database_char resources={"patroni": "patroni.tar.gz"}, application_name=ANOTHER_DATABASE_APP_NAME, num_units=2, + series="focal", trust=True, ), ) diff --git a/tests/integration/test_db_admin.py b/tests/integration/test_db_admin.py index d8b29fb2ca..55d4215eb1 100644 --- a/tests/integration/test_db_admin.py +++ b/tests/integration/test_db_admin.py @@ -37,6 +37,7 @@ async def test_landscape_scalable_bundle_db(ops_test: OpsTest, charm: str) -> No config=config, resources=resources, application_name=DATABASE_APP_NAME, + series="focal", num_units=DATABASE_UNITS, ) # Attach the resource to the controller. From 072252a79d433dea95848d6acb2e7117dfa26fd7 Mon Sep 17 00:00:00 2001 From: WRFitch Date: Tue, 18 Oct 2022 14:38:47 +0100 Subject: [PATCH 12/28] adding series flags to test apps --- tests/integration/test_db_admin.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_db_admin.py b/tests/integration/test_db_admin.py index 55d4215eb1..d8b29fb2ca 100644 --- a/tests/integration/test_db_admin.py +++ b/tests/integration/test_db_admin.py @@ -37,7 +37,6 @@ async def test_landscape_scalable_bundle_db(ops_test: OpsTest, charm: str) -> No config=config, resources=resources, application_name=DATABASE_APP_NAME, - series="focal", num_units=DATABASE_UNITS, ) # Attach the resource to the controller. 
From f340ccfe395b4db1821771a9fc34b614621d90a7 Mon Sep 17 00:00:00 2001 From: WRFitch Date: Tue, 18 Oct 2022 15:03:30 +0100 Subject: [PATCH 13/28] made series into a list --- tests/integration/ha_tests/application-charm/metadata.yaml | 3 ++- .../integration/new_relations/application-charm/metadata.yaml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/integration/ha_tests/application-charm/metadata.yaml b/tests/integration/ha_tests/application-charm/metadata.yaml index b8f8c7f8fa..768d3260e7 100644 --- a/tests/integration/ha_tests/application-charm/metadata.yaml +++ b/tests/integration/ha_tests/application-charm/metadata.yaml @@ -6,7 +6,8 @@ description: | summary: | Data platform libs application meant to be used only for testing high availability of the PostgreSQL charm. -series: focal +series: + - focal requires: database: diff --git a/tests/integration/new_relations/application-charm/metadata.yaml b/tests/integration/new_relations/application-charm/metadata.yaml index df1c79c3c4..bb31edfc85 100644 --- a/tests/integration/new_relations/application-charm/metadata.yaml +++ b/tests/integration/new_relations/application-charm/metadata.yaml @@ -6,7 +6,8 @@ description: | summary: | Data platform libs application meant to be used only for testing of the libs in this repository. -series: focal +series: + - focal requires: first-database: interface: postgresql_client From b537e8293a782581796d52cbda437f81d7b3a30e Mon Sep 17 00:00:00 2001 From: Will Fitch Date: Tue, 18 Oct 2022 16:55:48 +0100 Subject: [PATCH 14/28] Update test_new_relations.py --- tests/integration/new_relations/test_new_relations.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/integration/new_relations/test_new_relations.py b/tests/integration/new_relations/test_new_relations.py index dea16de2ab..93a9b090ff 100644 --- a/tests/integration/new_relations/test_new_relations.py +++ b/tests/integration/new_relations/test_new_relations.py @@ -48,7 +48,6 @@ async def test_deploy_charms(ops_test: OpsTest, application_charm, database_char resources={"patroni": "patroni.tar.gz"}, application_name=DATABASE_APP_NAME, num_units=1, - series="focal", trust=True, ), ops_test.model.deploy( @@ -56,7 +55,6 @@ async def test_deploy_charms(ops_test: OpsTest, application_charm, database_char resources={"patroni": "patroni.tar.gz"}, application_name=ANOTHER_DATABASE_APP_NAME, num_units=2, - series="focal", trust=True, ), ) From 6c0db4b6118765cd5457809c7d81285c70ad01d0 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Tue, 18 Oct 2022 16:00:53 -0300 Subject: [PATCH 15/28] Add retrying --- tests/integration/ha_tests/test_self_healing.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index 48009055ae..92577b2aa2 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -76,9 +76,11 @@ async def test_kill_db_process( # Verify that the database service got restarted and is ready in the old primary. assert await postgresql_ready(ops_test, primary_name) - # Verify that a new primary gets elected (ie old primary is secondary). - new_primary_name = await get_primary(ops_test, app) - assert new_primary_name != primary_name + # Verify that a new primary gets elected (ie old primary is secondary). 
+ for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)): + with attempt: + new_primary_name = await get_primary(ops_test, app) + assert new_primary_name != primary_name # Revert the "master_start_timeout" parameter to avoid fail-over again. await change_master_start_timeout(ops_test, original_master_start_timeout) From 60bc8e21b240f84c4e87b44a021bb9b981e7dbf7 Mon Sep 17 00:00:00 2001 From: WRFitch Date: Tue, 18 Oct 2022 23:21:01 +0100 Subject: [PATCH 16/28] updating test to better emulate bundle deploymen --- tests/integration/test_tls.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_tls.py b/tests/integration/test_tls.py index 764d1506ef..d823d5e5eb 100644 --- a/tests/integration/test_tls.py +++ b/tests/integration/test_tls.py @@ -26,7 +26,8 @@ async def test_deploy_active(ops_test: OpsTest): charm, resources={"patroni": "patroni.tar.gz"}, application_name=APP_NAME, num_units=3 ) await ops_test.juju("attach-resource", APP_NAME, "patroni=patroni.tar.gz") - await ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=1000) + # No wait between deploying charms, since we can't guarantee users will wait. Furthermore, + # bundles don't wait between deploying charms. @pytest.mark.tls_tests @@ -36,10 +37,6 @@ async def test_tls_enabled(ops_test: OpsTest) -> None: # Deploy TLS Certificates operator. config = {"generate-self-signed-certificates": "true", "ca-common-name": "Test CA"} await ops_test.model.deploy(TLS_CERTIFICATES_APP_NAME, channel="edge", config=config) - await ops_test.model.wait_for_idle( - apps=[TLS_CERTIFICATES_APP_NAME], status="active", timeout=1000 - ) - # Relate it to the PostgreSQL to enable TLS. await ops_test.model.relate(DATABASE_APP_NAME, TLS_CERTIFICATES_APP_NAME) await ops_test.model.wait_for_idle(status="active", timeout=1000) From f73aefcace1c7fe21f9eb6d458b8bd6826ed0244 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Wed, 16 Nov 2022 16:08:29 -0300 Subject: [PATCH 17/28] Remove unused code --- src/cluster.py | 3 +-- .../ha_tests/application-charm/metadata.yaml | 2 -- .../new_relations/application-charm/metadata.yaml | 2 -- tests/unit/test_cluster.py | 11 ----------- 4 files changed, 1 insertion(+), 17 deletions(-) diff --git a/src/cluster.py b/src/cluster.py index 0acbd0e8c7..5c2a82543a 100644 --- a/src/cluster.py +++ b/src/cluster.py @@ -380,8 +380,7 @@ def remove_raft_member(self, member_ip: str) -> None: @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10)) def reload_patroni_configuration(self): """Reload Patroni configuration after it was changed.""" - if service_running(PATRONI_SERVICE): - requests.post(f"{self._patroni_url}/reload", verify=self.verify) + requests.post(f"{self._patroni_url}/reload", verify=self.verify) def restart_patroni(self) -> bool: """Restart Patroni. diff --git a/tests/integration/ha_tests/application-charm/metadata.yaml b/tests/integration/ha_tests/application-charm/metadata.yaml index 768d3260e7..d9ba6ef6f3 100644 --- a/tests/integration/ha_tests/application-charm/metadata.yaml +++ b/tests/integration/ha_tests/application-charm/metadata.yaml @@ -6,8 +6,6 @@ description: | summary: | Data platform libs application meant to be used only for testing high availability of the PostgreSQL charm. 
-series: - - focal requires: database: diff --git a/tests/integration/new_relations/application-charm/metadata.yaml b/tests/integration/new_relations/application-charm/metadata.yaml index bb31edfc85..4ada1e1e14 100644 --- a/tests/integration/new_relations/application-charm/metadata.yaml +++ b/tests/integration/new_relations/application-charm/metadata.yaml @@ -6,8 +6,6 @@ description: | summary: | Data platform libs application meant to be used only for testing of the libs in this repository. -series: - - focal requires: first-database: interface: postgresql_client diff --git a/tests/unit/test_cluster.py b/tests/unit/test_cluster.py index b665fd7e89..19310ae3a4 100644 --- a/tests/unit/test_cluster.py +++ b/tests/unit/test_cluster.py @@ -276,14 +276,3 @@ def test_start_patroni(self, _create_directory, _service_running, _service_start # Test a fail scenario. success = self.patroni.start_patroni() assert not success - - @patch("cluster.service_running") - @patch("requests.post") - def test_reload_patroni_configuration(self, _post, _service_running): - _service_running.side_effect = [False, True] - - self.patroni.reload_patroni_configuration() - _post.assert_not_called() - - self.patroni.reload_patroni_configuration() - _post.assert_called() From c539f7893a4d95e17ee1f8a977e8b2240433cc4f Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Wed, 16 Nov 2022 16:46:34 -0300 Subject: [PATCH 18/28] Change processes list --- tests/integration/ha_tests/test_self_healing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index 92577b2aa2..bcf28bda2a 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -173,7 +173,7 @@ async def test_freeze_db_process( @pytest.mark.ha_self_healing_tests -@pytest.mark.parametrize("process", DB_PROCESSES) +@pytest.mark.parametrize("process", [POSTGRESQL_PROCESS]) async def test_sst( ops_test: OpsTest, process: str, continuous_writes, master_start_timeout ) -> None: From ed987ed6b8a96ea2adaa56e77fe4dac561b39d8f Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Fri, 18 Nov 2022 11:21:15 -0300 Subject: [PATCH 19/28] Add logic for ensuring all units down --- tests/integration/ha_tests/conftest.py | 11 +++ tests/integration/ha_tests/helpers.py | 75 ++++++++++++++++++- .../integration/ha_tests/test_self_healing.py | 42 ++++++++--- 3 files changed, 115 insertions(+), 13 deletions(-) diff --git a/tests/integration/ha_tests/conftest.py b/tests/integration/ha_tests/conftest.py index 5bc999784b..030675ab3a 100644 --- a/tests/integration/ha_tests/conftest.py +++ b/tests/integration/ha_tests/conftest.py @@ -5,9 +5,11 @@ from pytest_operator.plugin import OpsTest from tests.integration.ha_tests.helpers import ( + ORIGINAL_RESTART_DELAY, app_name, change_master_start_timeout, get_master_start_timeout, + update_restart_delay, ) APPLICATION_NAME = "application" @@ -40,3 +42,12 @@ async def master_start_timeout(ops_test: OpsTest) -> None: yield # Rollback to the initial configuration. 
await change_master_start_timeout(ops_test, initial_master_start_timeout) + + +@pytest.fixture() +async def reset_restart_delay(ops_test: OpsTest): + """Resets service file delay on all units.""" + yield + app = await app_name(ops_test) + for unit in ops_test.model.applications[app].units: + await update_restart_delay(ops_test, unit, ORIGINAL_RESTART_DELAY) diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index c9d7ad31a4..d4742bee9a 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -1,5 +1,6 @@ # Copyright 2022 Canonical Ltd. # See LICENSE file for licensing details. +import subprocess from pathlib import Path from typing import Optional @@ -14,6 +15,10 @@ METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) PORT = 5432 APP_NAME = METADATA["name"] +PATRONI_SERVICE_DEFAULT_PATH = "/etc/systemd/system/patroni.service" +TMP_SERVICE_PATH = "tests/integration/ha_tests/tmp.service" +RESTART_DELAY = 60 * 3 +ORIGINAL_RESTART_DELAY = 30 class MemberNotListedOnClusterError(Exception): @@ -25,7 +30,37 @@ class MemberNotUpdatedOnClusterError(Exception): class ProcessError(Exception): - pass + """Raised when a process fails.""" + + +class ProcessRunningError(Exception): + """Raised when a process is running when it is not expected to be.""" + + +async def all_db_processes_down(ops_test: OpsTest, process: str) -> bool: + """Verifies that all units of the charm do not have the DB process running.""" + app = await app_name(ops_test) + + try: + for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): + with attempt: + for unit in ops_test.model.applications[app].units: + search_db_process = f'run --unit {unit.name} ps ax | grep "{process} "' + _, processes, _ = await ops_test.juju(*search_db_process.split()) + + # `ps ax | grep "{DB_PROCESS} "` is a process on its own and will be shown in + # the output of ps aux, hence it is important that we check if there is + # more than one process containing the name `DB_PROCESS` + # splitting processes by "\n" results in one or more empty lines, hence we + # need to process these lines accordingly. + processes = [proc for proc in processes.split("\n") if len(proc) > 0] + + if len(processes) > 1: + raise ProcessRunningError + except RetryError: + return False + + return True async def app_name(ops_test: OpsTest, application_name: str = "postgresql") -> Optional[str]: @@ -286,3 +321,41 @@ async def stop_continuous_writes(ops_test: OpsTest) -> int: ) action = await action.wait() return int(action.results["writes"]) + + +async def update_restart_delay(ops_test: OpsTest, unit, delay: int): + """Updates the restart delay in the DB service file. + + When the DB service fails it will now wait for `delay` number of seconds. + """ + # Load the service file from the unit and update it with the new delay. 
+ await unit.scp_from(source=PATRONI_SERVICE_DEFAULT_PATH, destination=TMP_SERVICE_PATH) + with open(TMP_SERVICE_PATH, "r") as patroni_service_file: + patroni_service = patroni_service_file.readlines() + + for index, line in enumerate(patroni_service): + if "RestartSec" in line: + patroni_service[index] = f"RestartSec={delay}s\n" + + with open(TMP_SERVICE_PATH, "w") as service_file: + service_file.writelines(patroni_service) + + # Upload the changed file back to the unit, we cannot scp this file directly to + # PATRONI_SERVICE_DEFAULT_PATH since this directory has strict permissions, instead we scp it + # elsewhere and then move it to PATRONI_SERVICE_DEFAULT_PATH. + await unit.scp_to(source=TMP_SERVICE_PATH, destination="patroni.service") + mv_cmd = ( + f"run --unit {unit.name} mv /home/ubuntu/patroni.service {PATRONI_SERVICE_DEFAULT_PATH}" + ) + return_code, _, _ = await ops_test.juju(*mv_cmd.split()) + if return_code != 0: + raise ProcessError("Command: %s failed on unit: %s.", mv_cmd, unit.name) + + # Remove temporary file from machine. + subprocess.call(["rm", TMP_SERVICE_PATH]) + + # Reload the daemon for systemd otherwise changes are not saved. + reload_cmd = f"run --unit {unit.name} systemctl daemon-reload" + return_code, _, _ = await ops_test.juju(*reload_cmd.split()) + if return_code != 0: + raise ProcessError("Command: %s failed on unit: %s.", reload_cmd, unit.name) diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index bcf28bda2a..2dc8851402 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # Copyright 2022 Canonical Ltd. # See LICENSE file for licensing details. +import asyncio import pytest from pytest_operator.plugin import OpsTest @@ -8,6 +9,8 @@ from tests.integration.ha_tests.helpers import ( METADATA, + RESTART_DELAY, + all_db_processes_down, app_name, change_master_start_timeout, count_writes, @@ -20,6 +23,7 @@ send_signal_to_process, start_continuous_writes, stop_continuous_writes, + update_restart_delay, ) APP_NAME = METADATA["name"] @@ -44,7 +48,7 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: await ops_test.model.wait_for_idle(status="active", timeout=1000) -@pytest.mark.ha_self_healing_tests +# @pytest.mark.ha_self_healing_tests @pytest.mark.parametrize("process", DB_PROCESSES) async def test_kill_db_process( ops_test: OpsTest, process: str, continuous_writes, master_start_timeout @@ -106,7 +110,7 @@ async def test_kill_db_process( ), "secondary not up to date with the cluster after restarting." -@pytest.mark.ha_self_healing_tests +# @pytest.mark.ha_self_healing_tests @pytest.mark.parametrize("process", DB_PROCESSES) async def test_freeze_db_process( ops_test: OpsTest, process: str, continuous_writes, master_start_timeout @@ -184,6 +188,11 @@ async def test_sst( app = await app_name(ops_test) primary_name = await get_primary(ops_test, app) + # Copy data dir content removal script. + await ops_test.juju( + "scp", "tests/integration/ha_tests/clean-data-dir.sh", f"{primary_name}:/tmp" + ) + # Start an application that continuously writes data to the database. await start_continuous_writes(ops_test, app) @@ -191,13 +200,23 @@ async def test_sst( original_master_start_timeout = await get_master_start_timeout(ops_test) await change_master_start_timeout(ops_test, 0) - # Copy data dir content removal script. 
- await ops_test.juju( - "scp", "tests/integration/ha_tests/clean-data-dir.sh", f"{primary_name}:/tmp" + # Update all units to have a new RESTART_DELAY. Modifying the Restart delay to 3 minutes + # should ensure enough time for all replicas to be down at the same time. + for unit in ops_test.model.applications[app].units: + await update_restart_delay(ops_test, unit, RESTART_DELAY) + + # Restart all units "simultaneously". + await asyncio.gather( + *[ + send_signal_to_process(ops_test, unit.name, process, kill_code="SIGTERM") + for unit in ops_test.model.applications[app].units + ] ) - # Force a restart of the database process. - await send_signal_to_process(ops_test, primary_name, process, "SIGKILL") + # This test serves to verify behavior when all replicas are down at the same time that when + # they come back online they operate as expected. This check verifies that we meet the criteria + # of all replicas being down at the same time. + assert await all_db_processes_down(ops_test, process), "Not all units down at the same time." # Data removal runs within a script, so it allows `*` expansion. return_code, _, _ = await ops_test.juju( @@ -210,17 +229,16 @@ async def test_sst( async with ops_test.fast_forward(): # Verify new writes are continuing by counting the number of writes before and after a - # 3 minutes wait (this is a little more than the loop wait configuration, that is - # considered to trigger a fail-over after master_start_timeout is changed, and also - # when freezing the DB process it take some more time to trigger the fail-over). + # 60 seconds wait (this is a little more than the loop wait configuration, that is + # considered to trigger a fail-over after master_start_timeout is changed). writes = await count_writes(ops_test, primary_name) - for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)): + for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): with attempt: more_writes = await count_writes(ops_test, primary_name) assert more_writes > writes, "writes not continuing to DB" # Verify that a new primary gets elected (ie old primary is secondary). 
- for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)): + for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): with attempt: new_primary_name = await get_primary(ops_test, app) assert new_primary_name != primary_name From b1c8c2cb1f413e226d07b597bc45f5df27ff73b2 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Fri, 18 Nov 2022 14:15:57 -0300 Subject: [PATCH 20/28] Change delay to only one unit --- tests/integration/ha_tests/helpers.py | 32 +------------------ .../integration/ha_tests/test_self_healing.py | 20 +++--------- 2 files changed, 6 insertions(+), 46 deletions(-) diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index d4742bee9a..d23d69a6b5 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -30,37 +30,7 @@ class MemberNotUpdatedOnClusterError(Exception): class ProcessError(Exception): - """Raised when a process fails.""" - - -class ProcessRunningError(Exception): - """Raised when a process is running when it is not expected to be.""" - - -async def all_db_processes_down(ops_test: OpsTest, process: str) -> bool: - """Verifies that all units of the charm do not have the DB process running.""" - app = await app_name(ops_test) - - try: - for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): - with attempt: - for unit in ops_test.model.applications[app].units: - search_db_process = f'run --unit {unit.name} ps ax | grep "{process} "' - _, processes, _ = await ops_test.juju(*search_db_process.split()) - - # `ps ax | grep "{DB_PROCESS} "` is a process on its own and will be shown in - # the output of ps aux, hence it is important that we check if there is - # more than one process containing the name `DB_PROCESS` - # splitting processes by "\n" results in one or more empty lines, hence we - # need to process these lines accordingly. - processes = [proc for proc in processes.split("\n") if len(proc) > 0] - - if len(processes) > 1: - raise ProcessRunningError - except RetryError: - return False - - return True + pass async def app_name(ops_test: OpsTest, application_name: str = "postgresql") -> Optional[str]: diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index 2dc8851402..fba86d5387 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 # Copyright 2022 Canonical Ltd. # See LICENSE file for licensing details. -import asyncio import pytest from pytest_operator.plugin import OpsTest @@ -10,7 +9,6 @@ from tests.integration.ha_tests.helpers import ( METADATA, RESTART_DELAY, - all_db_processes_down, app_name, change_master_start_timeout, count_writes, @@ -179,7 +177,7 @@ async def test_freeze_db_process( @pytest.mark.ha_self_healing_tests @pytest.mark.parametrize("process", [POSTGRESQL_PROCESS]) async def test_sst( - ops_test: OpsTest, process: str, continuous_writes, master_start_timeout + ops_test: OpsTest, process: str, continuous_writes, master_start_timeout, reset_restart_delay ) -> None: """The SST test. @@ -203,20 +201,12 @@ async def test_sst( # Update all units to have a new RESTART_DELAY. Modifying the Restart delay to 3 minutes # should ensure enough time for all replicas to be down at the same time. 
for unit in ops_test.model.applications[app].units: - await update_restart_delay(ops_test, unit, RESTART_DELAY) + if unit.name == primary_name: + await update_restart_delay(ops_test, unit, RESTART_DELAY) + break # Restart all units "simultaneously". - await asyncio.gather( - *[ - send_signal_to_process(ops_test, unit.name, process, kill_code="SIGTERM") - for unit in ops_test.model.applications[app].units - ] - ) - - # This test serves to verify behavior when all replicas are down at the same time that when - # they come back online they operate as expected. This check verifies that we meet the criteria - # of all replicas being down at the same time. - assert await all_db_processes_down(ops_test, process), "Not all units down at the same time." + await send_signal_to_process(ops_test, primary_name, process, kill_code="SIGTERM") # Data removal runs within a script, so it allows `*` expansion. return_code, _, _ = await ops_test.juju( From 8e507a0b865b9a1cc6ed60dc8a6585cfcd6da188 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Fri, 18 Nov 2022 16:50:40 -0300 Subject: [PATCH 21/28] Add WAL switch --- tests/integration/ha_tests/helpers.py | 26 +++++++++++---- .../integration/ha_tests/test_self_healing.py | 33 ++++++++++++++----- 2 files changed, 45 insertions(+), 14 deletions(-) diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index d23d69a6b5..797f025060 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -183,13 +183,27 @@ async def get_primary(ops_test: OpsTest, app) -> str: """Use the charm action to retrieve the primary from provided application. Returns: - string with the password stored on the peer relation databag. + primary unit name. """ - # Can retrieve from any unit running unit, so we pick the first. - unit_name = ops_test.model.applications[app].units[0].name - action = await ops_test.model.units.get(unit_name).run_action("get-primary") - action = await action.wait() - return action.results["primary"] + for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): + # Can retrieve from any unit running unit, so we pick the first. 
+ unit_name = ops_test.model.applications[app].units[0].name + action = await ops_test.model.units.get(unit_name).run_action("get-primary") + action = await action.wait() + assert action.results["primary"] is not None + return action.results["primary"] + + +async def list_wal_files(ops_test: OpsTest, app: str): + units = [unit.name for unit in ops_test.model.applications[app].units] + command = "ls -al /var/lib/postgresql/data/pgdata/pg_wal/" + for unit in units: + print(f"unit name: {unit}") + complete_command = f"run --unit {unit} -- {command}" + return_code, stdout, stderr = await ops_test.juju(*complete_command.split()) + print(f"return_code: {return_code}") + print(f"stdout: {stdout}") + print(f"stderr: {stderr}") async def send_signal_to_process( diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index fba86d5387..03a377efff 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -16,6 +16,7 @@ get_master_start_timeout, get_primary, is_replica, + list_wal_files, postgresql_ready, secondary_up_to_date, send_signal_to_process, @@ -23,6 +24,7 @@ stop_continuous_writes, update_restart_delay, ) +from tests.integration.helpers import db_connect, get_password, get_unit_address APP_NAME = METADATA["name"] PATRONI_PROCESS = "/usr/local/bin/patroni" @@ -218,6 +220,15 @@ async def test_sst( assert return_code == 0, "Failed to remove data directory" async with ops_test.fast_forward(): + # Verify that a new primary gets elected (ie old primary is secondary). + for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): + with attempt: + new_primary_name = await get_primary(ops_test, app) + assert new_primary_name != primary_name + + # Revert the "master_start_timeout" parameter to avoid fail-over again. + await change_master_start_timeout(ops_test, original_master_start_timeout) + # Verify new writes are continuing by counting the number of writes before and after a # 60 seconds wait (this is a little more than the loop wait configuration, that is # considered to trigger a fail-over after master_start_timeout is changed). @@ -227,14 +238,20 @@ async def test_sst( more_writes = await count_writes(ops_test, primary_name) assert more_writes > writes, "writes not continuing to DB" - # Verify that a new primary gets elected (ie old primary is secondary). - for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): - with attempt: - new_primary_name = await get_primary(ops_test, app) - assert new_primary_name != primary_name - - # Revert the "master_start_timeout" parameter to avoid fail-over again. - await change_master_start_timeout(ops_test, original_master_start_timeout) + # Write some data to the initial primary (this causes a divergence + # in the instances' timelines). + await list_wal_files(ops_test, app) + print(f"primary_name: {primary_name}") + print(f"new_primary_name: {new_primary_name}") + host = get_unit_address(ops_test, new_primary_name) + password = await get_password(ops_test, new_primary_name) + with db_connect(host, password) as connection: + connection.autocommit = True + with connection.cursor() as cursor: + cursor.execute("SELECT pg_switch_wal();") + connection.close() + print("...") + await list_wal_files(ops_test, app) # Verify that the database service got restarted and is ready in the old primary. 
assert await postgresql_ready(ops_test, primary_name) From 107e24cb7ad386aeac92bb65049722d885671c17 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Mon, 28 Nov 2022 14:33:56 -0300 Subject: [PATCH 22/28] Updates related to WAL removal --- tests/integration/ha_tests/helpers.py | 44 ++++++++++++++--- .../integration/ha_tests/test_self_healing.py | 49 +++++++++++++++++-- 2 files changed, 82 insertions(+), 11 deletions(-) diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index 8c95578757..84cbc58607 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -67,6 +67,35 @@ async def change_master_start_timeout(ops_test: OpsTest, seconds: Optional[int]) ) +async def change_wal_settings( + ops_test: OpsTest, unit_name: str, max_wal_size: int, min_wal_size, wal_keep_segments +) -> None: + """Change wal_keep_segments configuration. + + Args: + ops_test: ops_test instance. + unit_name: name of the unit to change wal_keep_segments configuration. + max_wal_size: maximum amount of WAL to keep (MB). + min_wal_size: minimum amount of WAL to keep (MB). + wal_keep_segments: number of WAL segments to keep. + """ + for attempt in Retrying(stop=stop_after_delay(30 * 2), wait=wait_fixed(3)): + with attempt: + unit_ip = get_unit_address(ops_test, unit_name) + requests.patch( + f"http://{unit_ip}:8008/config", + json={ + "postgresql": { + "parameters": { + "max_wal_size": max_wal_size, + "min_wal_size": min_wal_size, + "wal_keep_segments": wal_keep_segments, + } + } + }, + ) + + async def count_writes(ops_test: OpsTest, down_unit: str = None) -> int: """Count the number of writes in the database.""" app = await app_name(ops_test) @@ -196,14 +225,17 @@ async def get_primary(ops_test: OpsTest, app) -> str: async def list_wal_files(ops_test: OpsTest, app: str): units = [unit.name for unit in ops_test.model.applications[app].units] - command = "ls -al /var/lib/postgresql/data/pgdata/pg_wal/" + command = "ls -1 /var/lib/postgresql/data/pgdata/pg_wal/" + files = {} for unit in units: - print(f"unit name: {unit}") complete_command = f"run --unit {unit} -- {command}" return_code, stdout, stderr = await ops_test.juju(*complete_command.split()) - print(f"return_code: {return_code}") - print(f"stdout: {stdout}") - print(f"stderr: {stderr}") + files[unit] = stdout.splitlines() + files[unit] = [ + i for i in files[unit] if ".history" not in i and i != "" and i != "archive_status" + ] + files[unit].append("archive_status/*") + return files async def send_signal_to_process( @@ -257,7 +289,7 @@ async def secondary_up_to_date(ops_test: OpsTest, unit_name: str, expected_write ) try: - for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): + for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)): with attempt: with psycopg2.connect( connection_string diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index 4e4d950ffd..b7eaacc85c 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -11,6 +11,7 @@ RESTART_DELAY, app_name, change_master_start_timeout, + change_wal_settings, count_writes, fetch_cluster_members, get_master_start_timeout, @@ -208,9 +209,29 @@ async def test_restart_db_process( new_primary_name = await get_primary(ops_test, app) assert new_primary_name != primary_name + # Verify that the old primary is now a replica. 
+ assert is_replica(ops_test, primary_name), "there are more than one primary in the cluster." + + # Verify that all units are part of the same cluster. + member_ips = await fetch_cluster_members(ops_test) + ip_addresses = [unit.public_address for unit in ops_test.model.applications[app].units] + assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster." + + # Verify that no writes to the database were missed after stopping the writes. + total_expected_writes = await stop_continuous_writes(ops_test) + for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): + with attempt: + actual_writes = await count_writes(ops_test) + assert total_expected_writes == actual_writes, "writes to the db were missed." + + # Verify that old primary is up-to-date. + assert await secondary_up_to_date( + ops_test, primary_name, total_expected_writes + ), "secondary not up to date with the cluster after restarting." + @pytest.mark.ha_self_healing_tests -@pytest.mark.parametrize("process", [POSTGRESQL_PROCESS]) +@pytest.mark.parametrize("process", DB_PROCESSES) async def test_sst( ops_test: OpsTest, process: str, continuous_writes, master_start_timeout, reset_restart_delay ) -> None: @@ -271,19 +292,34 @@ async def test_sst( more_writes = await count_writes(ops_test, primary_name) assert more_writes > writes, "writes not continuing to DB" + for unit in ops_test.model.applications[app].units: + if unit.name == primary_name: + continue + await change_wal_settings(ops_test, unit.name, 32, 32, 1) + # Write some data to the initial primary (this causes a divergence # in the instances' timelines). - await list_wal_files(ops_test, app) - print(f"primary_name: {primary_name}") - print(f"new_primary_name: {new_primary_name}") + files = await list_wal_files(ops_test, app) host = get_unit_address(ops_test, new_primary_name) password = await get_password(ops_test, new_primary_name) with db_connect(host, password) as connection: connection.autocommit = True with connection.cursor() as cursor: + slot_name = primary_name.replace("/", "_") + cursor.execute( + f"SELECT pg_drop_replication_slot(slot_name) FROM pg_replication_slots WHERE slot_name = '{slot_name}';" + ) + cursor.execute("SELECT pg_switch_wal();") + cursor.execute("CHECKPOINT;") cursor.execute("SELECT pg_switch_wal();") + # cursor.execute("CHECKPOINT;") + # cursor.execute("SELECT pg_switch_wal();") + # cursor.execute("CHECKPOINT;") + # cursor.execute("SELECT pg_switch_wal();") connection.close() - print("...") + new_files = await list_wal_files(ops_test, app) + print(f"files: {files}") + print(f"new_files: {new_files}") await list_wal_files(ops_test, app) # Verify that the database service got restarted and is ready in the old primary. @@ -308,3 +344,6 @@ async def test_sst( assert await secondary_up_to_date( ops_test, primary_name, total_expected_writes ), "secondary not up to date with the cluster after restarting." 
+ + # for unit in ops_test.model.applications[app].units: + # await change_wal_keep_segments(ops_test, unit.name, 0) From 50721e10f09513d1ccc3400595eff2727a154969 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Mon, 28 Nov 2022 17:41:08 -0300 Subject: [PATCH 23/28] Small improvements --- tests/integration/ha_tests/helpers.py | 9 +++--- .../integration/ha_tests/test_self_healing.py | 30 ++++++++----------- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index 84cbc58607..002900024f 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -70,11 +70,11 @@ async def change_master_start_timeout(ops_test: OpsTest, seconds: Optional[int]) async def change_wal_settings( ops_test: OpsTest, unit_name: str, max_wal_size: int, min_wal_size, wal_keep_segments ) -> None: - """Change wal_keep_segments configuration. + """Change WAL settings in the unit. Args: ops_test: ops_test instance. - unit_name: name of the unit to change wal_keep_segments configuration. + unit_name: name of the unit to change the WAL settings. max_wal_size: maximum amount of WAL to keep (MB). min_wal_size: minimum amount of WAL to keep (MB). wal_keep_segments: number of WAL segments to keep. @@ -231,10 +231,9 @@ async def list_wal_files(ops_test: OpsTest, app: str): complete_command = f"run --unit {unit} -- {command}" return_code, stdout, stderr = await ops_test.juju(*complete_command.split()) files[unit] = stdout.splitlines() - files[unit] = [ + files[unit] = { i for i in files[unit] if ".history" not in i and i != "" and i != "archive_status" - ] - files[unit].append("archive_status/*") + } return files diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index b7eaacc85c..6660ef8357 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -177,7 +177,7 @@ async def test_freeze_db_process( ), "secondary not up to date with the cluster after restarting." -@pytest.mark.ha_self_healing_tests +# @pytest.mark.ha_self_healing_tests @pytest.mark.parametrize("process", DB_PROCESSES) async def test_restart_db_process( ops_test: OpsTest, process: str, continuous_writes, master_start_timeout @@ -231,7 +231,7 @@ async def test_restart_db_process( @pytest.mark.ha_self_healing_tests -@pytest.mark.parametrize("process", DB_PROCESSES) +@pytest.mark.parametrize("process", [POSTGRESQL_PROCESS]) async def test_sst( ops_test: OpsTest, process: str, continuous_writes, master_start_timeout, reset_restart_delay ) -> None: @@ -254,8 +254,8 @@ async def test_sst( original_master_start_timeout = await get_master_start_timeout(ops_test) await change_master_start_timeout(ops_test, 0) - # Update all units to have a new RESTART_DELAY. Modifying the Restart delay to 3 minutes - # should ensure enough time for all replicas to be down at the same time. + # Update the primary unit to have a new RESTART_DELAY. Modifying the Restart delay to 3 minutes + # should ensure enough time for the test. 
for unit in ops_test.model.applications[app].units: if unit.name == primary_name: await update_restart_delay(ops_test, unit, RESTART_DELAY) @@ -293,12 +293,12 @@ async def test_sst( assert more_writes > writes, "writes not continuing to DB" for unit in ops_test.model.applications[app].units: - if unit.name == primary_name: + if unit.name == new_primary_name: continue await change_wal_settings(ops_test, unit.name, 32, 32, 1) + break - # Write some data to the initial primary (this causes a divergence - # in the instances' timelines). + # Rotate the WAL segments. files = await list_wal_files(ops_test, app) host = get_unit_address(ops_test, new_primary_name) password = await get_password(ops_test, new_primary_name) @@ -312,15 +312,14 @@ async def test_sst( cursor.execute("SELECT pg_switch_wal();") cursor.execute("CHECKPOINT;") cursor.execute("SELECT pg_switch_wal();") - # cursor.execute("CHECKPOINT;") - # cursor.execute("SELECT pg_switch_wal();") - # cursor.execute("CHECKPOINT;") - # cursor.execute("SELECT pg_switch_wal();") connection.close() new_files = await list_wal_files(ops_test, app) - print(f"files: {files}") - print(f"new_files: {new_files}") - await list_wal_files(ops_test, app) + for unit_name in files: + assert not files[unit_name].intersection( + new_files + ), "WAL segments weren't correctly rotated" + + # await update_restart_delay(ops_test, primary_unit, ORIGINAL_RESTART_DELAY) # Verify that the database service got restarted and is ready in the old primary. assert await postgresql_ready(ops_test, primary_name) @@ -344,6 +343,3 @@ async def test_sst( assert await secondary_up_to_date( ops_test, primary_name, total_expected_writes ), "secondary not up to date with the cluster after restarting." - - # for unit in ops_test.model.applications[app].units: - # await change_wal_keep_segments(ops_test, unit.name, 0) From addf71622f668f7cc38195441c6bc729b8e96f90 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Mon, 28 Nov 2022 18:02:01 -0300 Subject: [PATCH 24/28] Add comments --- tests/integration/ha_tests/conftest.py | 16 +++++++++++++ tests/integration/ha_tests/helpers.py | 24 +++++++++++++++++-- .../integration/ha_tests/test_self_healing.py | 16 +++++++++---- 3 files changed, 49 insertions(+), 7 deletions(-) diff --git a/tests/integration/ha_tests/conftest.py b/tests/integration/ha_tests/conftest.py index 030675ab3a..11e3b64340 100644 --- a/tests/integration/ha_tests/conftest.py +++ b/tests/integration/ha_tests/conftest.py @@ -51,3 +51,19 @@ async def reset_restart_delay(ops_test: OpsTest): app = await app_name(ops_test) for unit in ops_test.model.applications[app].units: await update_restart_delay(ops_test, unit, ORIGINAL_RESTART_DELAY) + + +@pytest.fixture() +async def wal_settings(ops_test: OpsTest) -> None: + # """Restore the WAL settings to the initial values.""" + # # Get the value for each setting. + # initial_max_wal_size, initial_min_wal_size, initial_wal_keep_segments = + # await get_wal_settings( + # ops_test + # ) + # yield + # # Rollback to the initial settings. + # await change_wal_settings( + # ops_test, initial_max_wal_size, initial_min_wal_size, initial_wal_keep_segments + # ) + pass diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index 002900024f..fa9db6f504 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -2,7 +2,7 @@ # See LICENSE file for licensing details. 
import subprocess from pathlib import Path -from typing import Optional +from typing import Optional, Set import psycopg2 import requests @@ -162,6 +162,25 @@ async def get_master_start_timeout(ops_test: OpsTest) -> Optional[int]: return int(master_start_timeout) if master_start_timeout is not None else None +async def get_wal_settings(ops_test: OpsTest) -> Optional[int]: + """Get a list of the WAL settings used in the tests. + + Args: + ops_test: ops_test instance. + + Returns: + master start timeout in seconds or None if it's using the default value. + """ + for attempt in Retrying(stop=stop_after_delay(30 * 2), wait=wait_fixed(3)): + with attempt: + app = await app_name(ops_test) + primary_name = await get_primary(ops_test, app) + unit_ip = get_unit_address(ops_test, primary_name) + configuration_info = requests.get(f"http://{unit_ip}:8008/config") + initial_max_wal_size = configuration_info.json().get("initial_max_wal_size") + return int(initial_max_wal_size) if initial_max_wal_size is not None else None + + async def get_password(ops_test: OpsTest, app: str, down_unit: str = None) -> str: """Use the charm action to retrieve the password from provided application. @@ -223,7 +242,8 @@ async def get_primary(ops_test: OpsTest, app) -> str: return action.results["primary"] -async def list_wal_files(ops_test: OpsTest, app: str): +async def list_wal_files(ops_test: OpsTest, app: str) -> Set: + """Returns the list of WAL segment files in each unit.""" units = [unit.name for unit in ops_test.model.applications[app].units] command = "ls -1 /var/lib/postgresql/data/pgdata/pg_wal/" files = {} diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index 6660ef8357..0463081786 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -233,7 +233,12 @@ async def test_restart_db_process( @pytest.mark.ha_self_healing_tests @pytest.mark.parametrize("process", [POSTGRESQL_PROCESS]) async def test_sst( - ops_test: OpsTest, process: str, continuous_writes, master_start_timeout, reset_restart_delay + ops_test: OpsTest, + process: str, + continuous_writes, + master_start_timeout, + reset_restart_delay, + wal_settings, ) -> None: """The SST test. @@ -292,11 +297,11 @@ async def test_sst( more_writes = await count_writes(ops_test, primary_name) assert more_writes > writes, "writes not continuing to DB" + # Change some settings to enable WAL rotation. for unit in ops_test.model.applications[app].units: - if unit.name == new_primary_name: + if unit.name == primary_name: continue await change_wal_settings(ops_test, unit.name, 32, 32, 1) - break # Rotate the WAL segments. files = await list_wal_files(ops_test, app) @@ -305,22 +310,23 @@ async def test_sst( with db_connect(host, password) as connection: connection.autocommit = True with connection.cursor() as cursor: + # Remove the replication slot of the former primary to enable WAL rotation. slot_name = primary_name.replace("/", "_") cursor.execute( f"SELECT pg_drop_replication_slot(slot_name) FROM pg_replication_slots WHERE slot_name = '{slot_name}';" ) + # Run some commands to make PostgreSQL do WAL rotation. cursor.execute("SELECT pg_switch_wal();") cursor.execute("CHECKPOINT;") cursor.execute("SELECT pg_switch_wal();") connection.close() new_files = await list_wal_files(ops_test, app) + # Check that the WAL was correctly rotated. 
for unit_name in files: assert not files[unit_name].intersection( new_files ), "WAL segments weren't correctly rotated" - # await update_restart_delay(ops_test, primary_unit, ORIGINAL_RESTART_DELAY) - # Verify that the database service got restarted and is ready in the old primary. assert await postgresql_ready(ops_test, primary_name) From deb8d2312f89663627285f245b236960ea8a5b31 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Tue, 29 Nov 2022 09:53:53 -0300 Subject: [PATCH 25/28] Change the way service is stopped --- tests/integration/ha_tests/conftest.py | 36 +++++------- tests/integration/ha_tests/helpers.py | 58 +++++-------------- .../integration/ha_tests/test_self_healing.py | 32 +++++----- tests/integration/helpers.py | 16 +++++ 4 files changed, 59 insertions(+), 83 deletions(-) diff --git a/tests/integration/ha_tests/conftest.py b/tests/integration/ha_tests/conftest.py index 11e3b64340..08ab708f2d 100644 --- a/tests/integration/ha_tests/conftest.py +++ b/tests/integration/ha_tests/conftest.py @@ -5,11 +5,11 @@ from pytest_operator.plugin import OpsTest from tests.integration.ha_tests.helpers import ( - ORIGINAL_RESTART_DELAY, app_name, change_master_start_timeout, + change_wal_settings, get_master_start_timeout, - update_restart_delay, + get_postgresql_parameter, ) APPLICATION_NAME = "application" @@ -45,25 +45,21 @@ async def master_start_timeout(ops_test: OpsTest) -> None: @pytest.fixture() -async def reset_restart_delay(ops_test: OpsTest): - """Resets service file delay on all units.""" +async def wal_settings(ops_test: OpsTest) -> None: + """Restore the WAL settings to the initial values.""" + # Get the value for each setting. + initial_max_wal_size = await get_postgresql_parameter(ops_test, "max_wal_size") + initial_min_wal_size = await get_postgresql_parameter(ops_test, "min_wal_size") + initial_wal_keep_segments = await get_postgresql_parameter(ops_test, "wal_keep_segments") yield + # Rollback to the initial settings. app = await app_name(ops_test) for unit in ops_test.model.applications[app].units: - await update_restart_delay(ops_test, unit, ORIGINAL_RESTART_DELAY) - - -@pytest.fixture() -async def wal_settings(ops_test: OpsTest) -> None: - # """Restore the WAL settings to the initial values.""" - # # Get the value for each setting. - # initial_max_wal_size, initial_min_wal_size, initial_wal_keep_segments = - # await get_wal_settings( - # ops_test - # ) - # yield - # # Rollback to the initial settings. - # await change_wal_settings( - # ops_test, initial_max_wal_size, initial_min_wal_size, initial_wal_keep_segments - # ) + await change_wal_settings( + ops_test, + unit.name, + initial_max_wal_size, + initial_min_wal_size, + initial_wal_keep_segments, + ) pass diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index fa9db6f504..104ef84c6c 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -1,6 +1,5 @@ # Copyright 2022 Canonical Ltd. # See LICENSE file for licensing details. -import subprocess from pathlib import Path from typing import Optional, Set @@ -162,23 +161,30 @@ async def get_master_start_timeout(ops_test: OpsTest) -> Optional[int]: return int(master_start_timeout) if master_start_timeout is not None else None -async def get_wal_settings(ops_test: OpsTest) -> Optional[int]: - """Get a list of the WAL settings used in the tests. 
+async def get_postgresql_parameter(ops_test: OpsTest, parameter_name: str) -> Optional[int]: + """Get the value of a PostgreSQL parameter from Patroni API. Args: ops_test: ops_test instance. + parameter_name: the name of the parameter to get the value for. Returns: - master start timeout in seconds or None if it's using the default value. + the value of the requested PostgreSQL parameter. """ - for attempt in Retrying(stop=stop_after_delay(30 * 2), wait=wait_fixed(3)): + for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): with attempt: app = await app_name(ops_test) primary_name = await get_primary(ops_test, app) unit_ip = get_unit_address(ops_test, primary_name) configuration_info = requests.get(f"http://{unit_ip}:8008/config") - initial_max_wal_size = configuration_info.json().get("initial_max_wal_size") - return int(initial_max_wal_size) if initial_max_wal_size is not None else None + postgresql_dict = configuration_info.json().get("postgresql") + if postgresql_dict is None: + return None + parameters = postgresql_dict.get("parameters") + if parameters is None: + return None + parameter_value = parameters.get(parameter_name) + return parameter_value async def get_password(ops_test: OpsTest, app: str, down_unit: str = None) -> str: @@ -356,41 +362,3 @@ async def stop_continuous_writes(ops_test: OpsTest) -> int: ) action = await action.wait() return int(action.results["writes"]) - - -async def update_restart_delay(ops_test: OpsTest, unit, delay: int): - """Updates the restart delay in the DB service file. - - When the DB service fails it will now wait for `delay` number of seconds. - """ - # Load the service file from the unit and update it with the new delay. - await unit.scp_from(source=PATRONI_SERVICE_DEFAULT_PATH, destination=TMP_SERVICE_PATH) - with open(TMP_SERVICE_PATH, "r") as patroni_service_file: - patroni_service = patroni_service_file.readlines() - - for index, line in enumerate(patroni_service): - if "RestartSec" in line: - patroni_service[index] = f"RestartSec={delay}s\n" - - with open(TMP_SERVICE_PATH, "w") as service_file: - service_file.writelines(patroni_service) - - # Upload the changed file back to the unit, we cannot scp this file directly to - # PATRONI_SERVICE_DEFAULT_PATH since this directory has strict permissions, instead we scp it - # elsewhere and then move it to PATRONI_SERVICE_DEFAULT_PATH. - await unit.scp_to(source=TMP_SERVICE_PATH, destination="patroni.service") - mv_cmd = ( - f"run --unit {unit.name} mv /home/ubuntu/patroni.service {PATRONI_SERVICE_DEFAULT_PATH}" - ) - return_code, _, _ = await ops_test.juju(*mv_cmd.split()) - if return_code != 0: - raise ProcessError("Command: %s failed on unit: %s.", mv_cmd, unit.name) - - # Remove temporary file from machine. - subprocess.call(["rm", TMP_SERVICE_PATH]) - - # Reload the daemon for systemd otherwise changes are not saved. 
- reload_cmd = f"run --unit {unit.name} systemctl daemon-reload" - return_code, _, _ = await ops_test.juju(*reload_cmd.split()) - if return_code != 0: - raise ProcessError("Command: %s failed on unit: %s.", reload_cmd, unit.name) diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index 0463081786..d3eceaaac6 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -8,7 +8,6 @@ from tests.integration.ha_tests.helpers import ( METADATA, - RESTART_DELAY, app_name, change_master_start_timeout, change_wal_settings, @@ -23,9 +22,13 @@ send_signal_to_process, start_continuous_writes, stop_continuous_writes, - update_restart_delay, ) -from tests.integration.helpers import db_connect, get_password, get_unit_address +from tests.integration.helpers import ( + db_connect, + get_password, + get_unit_address, + run_command_on_unit, +) APP_NAME = METADATA["name"] PATRONI_PROCESS = "/usr/local/bin/patroni" @@ -232,18 +235,14 @@ async def test_restart_db_process( @pytest.mark.ha_self_healing_tests @pytest.mark.parametrize("process", [POSTGRESQL_PROCESS]) -async def test_sst( +async def test_forceful_restart_without_data_and_transaction_logs( ops_test: OpsTest, process: str, continuous_writes, master_start_timeout, - reset_restart_delay, wal_settings, ) -> None: - """The SST test. - - A forceful restart instance with deleted data and without transaction logs (forced clone). - """ + """A forceful restart with deleted data and without transaction logs (forced clone).""" app = await app_name(ops_test) primary_name = await get_primary(ops_test, app) @@ -259,15 +258,8 @@ async def test_sst( original_master_start_timeout = await get_master_start_timeout(ops_test) await change_master_start_timeout(ops_test, 0) - # Update the primary unit to have a new RESTART_DELAY. Modifying the Restart delay to 3 minutes - # should ensure enough time for the test. - for unit in ops_test.model.applications[app].units: - if unit.name == primary_name: - await update_restart_delay(ops_test, unit, RESTART_DELAY) - break - - # Restart all units "simultaneously". - await send_signal_to_process(ops_test, primary_name, process, kill_code="SIGTERM") + # Stop the systemd service on the primary unit. + await run_command_on_unit(ops_test, primary_name, "systemctl stop patroni") # Data removal runs within a script, so it allows `*` expansion. return_code, _, _ = await ops_test.juju( @@ -283,6 +275,7 @@ async def test_sst( for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): with attempt: new_primary_name = await get_primary(ops_test, app) + assert new_primary_name is not None assert new_primary_name != primary_name # Revert the "master_start_timeout" parameter to avoid fail-over again. @@ -327,6 +320,9 @@ async def test_sst( new_files ), "WAL segments weren't correctly rotated" + # Start the systemd service in the old primary. + await run_command_on_unit(ops_test, primary_name, "systemctl start patroni") + # Verify that the database service got restarted and is ready in the old primary. 
assert await postgresql_ready(ops_test, primary_name) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 3a674b94a0..5f3a3a287b 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -558,6 +558,22 @@ async def check_tls_patroni_api(ops_test: OpsTest, unit_name: str, enabled: bool return False +async def run_command_on_unit(ops_test: OpsTest, unit_name: str, command: str) -> None: + """Run a command on a specific unit. + + Args: + ops_test: The ops test framework instance + unit_name: The name of the unit to run the command on + command: The command to run + """ + complete_command = f"run --unit {unit_name} -- {command}" + return_code, stdout, _ = await ops_test.juju(*complete_command.split()) + if return_code != 0: + raise Exception( + "Expected command %s to succeed instead it failed: %s", command, return_code + ) + + async def scale_application(ops_test: OpsTest, application_name: str, count: int) -> None: """Scale a given application to a specific unit count. From d8d58ac131b7521aebac81446f7943c54801abdb Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Tue, 29 Nov 2022 10:49:16 -0300 Subject: [PATCH 26/28] Remove slot removal --- tests/integration/ha_tests/test_self_healing.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index d3eceaaac6..faac3260cd 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -303,11 +303,6 @@ async def test_forceful_restart_without_data_and_transaction_logs( with db_connect(host, password) as connection: connection.autocommit = True with connection.cursor() as cursor: - # Remove the replication slot of the former primary to enable WAL rotation. - slot_name = primary_name.replace("/", "_") - cursor.execute( - f"SELECT pg_drop_replication_slot(slot_name) FROM pg_replication_slots WHERE slot_name = '{slot_name}';" - ) # Run some commands to make PostgreSQL do WAL rotation. 
cursor.execute("SELECT pg_switch_wal();") cursor.execute("CHECKPOINT;") From 730e9d46781e39f29deb51d43a8c84b4a64bf6d1 Mon Sep 17 00:00:00 2001 From: Marcelo Henrique Neppel Date: Tue, 29 Nov 2022 10:54:44 -0300 Subject: [PATCH 27/28] Small fixes --- tests/integration/ha_tests/conftest.py | 1 - tests/integration/ha_tests/helpers.py | 4 ---- tests/integration/ha_tests/test_self_healing.py | 14 ++++++-------- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/tests/integration/ha_tests/conftest.py b/tests/integration/ha_tests/conftest.py index 08ab708f2d..68efb7e802 100644 --- a/tests/integration/ha_tests/conftest.py +++ b/tests/integration/ha_tests/conftest.py @@ -62,4 +62,3 @@ async def wal_settings(ops_test: OpsTest) -> None: initial_min_wal_size, initial_wal_keep_segments, ) - pass diff --git a/tests/integration/ha_tests/helpers.py b/tests/integration/ha_tests/helpers.py index 104ef84c6c..dd9a69aa72 100644 --- a/tests/integration/ha_tests/helpers.py +++ b/tests/integration/ha_tests/helpers.py @@ -14,10 +14,6 @@ METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) PORT = 5432 APP_NAME = METADATA["name"] -PATRONI_SERVICE_DEFAULT_PATH = "/etc/systemd/system/patroni.service" -TMP_SERVICE_PATH = "tests/integration/ha_tests/tmp.service" -RESTART_DELAY = 60 * 3 -ORIGINAL_RESTART_DELAY = 30 class MemberNotListedOnClusterError(Exception): diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py index faac3260cd..2036ea37d6 100644 --- a/tests/integration/ha_tests/test_self_healing.py +++ b/tests/integration/ha_tests/test_self_healing.py @@ -52,7 +52,7 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: await ops_test.model.wait_for_idle(status="active", timeout=1000) -# @pytest.mark.ha_self_healing_tests +@pytest.mark.ha_self_healing_tests @pytest.mark.parametrize("process", DB_PROCESSES) async def test_kill_db_process( ops_test: OpsTest, process: str, continuous_writes, master_start_timeout @@ -84,11 +84,9 @@ async def test_kill_db_process( # Verify that the database service got restarted and is ready in the old primary. assert await postgresql_ready(ops_test, primary_name) - # Verify that a new primary gets elected (ie old primary is secondary). - for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)): - with attempt: - new_primary_name = await get_primary(ops_test, app) - assert new_primary_name != primary_name + # Verify that a new primary gets elected (ie old primary is secondary). + new_primary_name = await get_primary(ops_test, app) + assert new_primary_name != primary_name # Revert the "master_start_timeout" parameter to avoid fail-over again. await change_master_start_timeout(ops_test, original_master_start_timeout) @@ -114,7 +112,7 @@ async def test_kill_db_process( ), "secondary not up to date with the cluster after restarting." -# @pytest.mark.ha_self_healing_tests +@pytest.mark.ha_self_healing_tests @pytest.mark.parametrize("process", DB_PROCESSES) async def test_freeze_db_process( ops_test: OpsTest, process: str, continuous_writes, master_start_timeout @@ -180,7 +178,7 @@ async def test_freeze_db_process( ), "secondary not up to date with the cluster after restarting." 
-# @pytest.mark.ha_self_healing_tests
+@pytest.mark.ha_self_healing_tests
 @pytest.mark.parametrize("process", DB_PROCESSES)
 async def test_restart_db_process(
     ops_test: OpsTest, process: str, continuous_writes, master_start_timeout

From 3497cf333439da6651161143594db1a15da3f658 Mon Sep 17 00:00:00 2001
From: Marcelo Henrique Neppel
Date: Tue, 29 Nov 2022 11:40:32 -0300
Subject: [PATCH 28/28] Remove unused parameter

---
 tests/integration/ha_tests/test_self_healing.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/integration/ha_tests/test_self_healing.py b/tests/integration/ha_tests/test_self_healing.py
index 2036ea37d6..3046d18347 100644
--- a/tests/integration/ha_tests/test_self_healing.py
+++ b/tests/integration/ha_tests/test_self_healing.py
@@ -232,10 +232,8 @@ async def test_restart_db_process(
 
 @pytest.mark.ha_self_healing_tests
-@pytest.mark.parametrize("process", [POSTGRESQL_PROCESS])
 async def test_forceful_restart_without_data_and_transaction_logs(
     ops_test: OpsTest,
-    process: str,
     continuous_writes,
     master_start_timeout,
     wal_settings,
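
Note on the WAL-rotation check this series converges on: the test lowers max_wal_size/min_wal_size/wal_keep_segments on every unit except the stopped primary, runs pg_switch_wal()/CHECKPOINT/pg_switch_wal() against the new primary, and then asserts that the WAL file listings taken before and after no longer overlap. The standalone sketch below only illustrates that sequence; it is not part of the patches, the connection values are hypothetical placeholders, and it lists segments with pg_ls_waldir() instead of the ls-over-SSH helper used in the tests.

import psycopg2

HOST = "10.0.0.1"        # hypothetical unit address
PASSWORD = "changeme"    # hypothetical operator password


def list_wal_segments(cursor) -> set:
    # List the current WAL segment file names, ignoring timeline history files.
    cursor.execute("SELECT name FROM pg_ls_waldir() WHERE name NOT LIKE '%.history';")
    return {row[0] for row in cursor.fetchall()}


connection = psycopg2.connect(
    f"dbname='postgres' user='operator' host='{HOST}' password='{PASSWORD}'"
)
connection.autocommit = True
with connection.cursor() as cursor:
    before = list_wal_segments(cursor)
    # Same statements the test issues to force PostgreSQL to rotate its WAL.
    cursor.execute("SELECT pg_switch_wal();")
    cursor.execute("CHECKPOINT;")
    cursor.execute("SELECT pg_switch_wal();")
    after = list_wal_segments(cursor)
connection.close()

# With small max_wal_size/min_wal_size and wal_keep_segments=1, the checkpoint
# should have removed or recycled the old segments, so the two sets should not overlap.
assert not before.intersection(after), "WAL segments weren't correctly rotated"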