Skip to content
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
04223a8
Add SST test
marceloneppel Oct 17, 2022
ebc4814
Enable previous tests
marceloneppel Oct 17, 2022
84fca0c
fix early tls deployment by only reloading patroni config if it's alr…
Oct 18, 2022
d5d0ad8
Improve code
marceloneppel Oct 18, 2022
1e64d31
Remove duplicate check
marceloneppel Oct 18, 2022
4688a95
Remove unused import
marceloneppel Oct 18, 2022
2b76d2e
added unit test for reloading patroni
Oct 18, 2022
edcfb0c
lint
Oct 18, 2022
51cee09
removing postgres restart check
Oct 18, 2022
f05a540
Pin Juju agent version on CI
marceloneppel Oct 18, 2022
5a97730
adding series flags to test apps
Oct 18, 2022
072252a
adding series flags to test apps
Oct 18, 2022
f340ccf
made series into a list
Oct 18, 2022
b537e82
Update test_new_relations.py
WRFitch Oct 18, 2022
6c0db4b
Add retrying
marceloneppel Oct 18, 2022
60bc8e2
updating test to better emulate bundle deployment
Oct 18, 2022
dc3dd8f
Merge branch 'fix-early-tls-deployment' of https://github.com/canonic…
Oct 18, 2022
6772702
Merge remote-tracking branch 'origin/fix-early-tls-deployment' into s…
marceloneppel Oct 19, 2022
3348c2b
Merge branch 'main' into sst-test
marceloneppel Nov 16, 2022
f73aefc
Remove unused code
marceloneppel Nov 16, 2022
c539f78
Change processes list
marceloneppel Nov 16, 2022
ed987ed
Add logic for ensuring all units down
marceloneppel Nov 18, 2022
b1c8c2c
Change delay to only one unit
marceloneppel Nov 18, 2022
8e507a0
Add WAL switch
marceloneppel Nov 18, 2022
6fcdebd
Merge branch 'main' into sst-test
marceloneppel Nov 28, 2022
107e24c
Updates related to WAL removal
marceloneppel Nov 28, 2022
50721e1
Small improvements
marceloneppel Nov 28, 2022
addf716
Add comments
marceloneppel Nov 28, 2022
deb8d23
Change the way service is stopped
marceloneppel Nov 29, 2022
d8d58ac
Remove slot removal
marceloneppel Nov 29, 2022
730e9d4
Small fixes
marceloneppel Nov 29, 2022
3497cf3
Remove unused parameter
marceloneppel Nov 29, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions tests/integration/ha_tests/clean-data-dir.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
# Wipe the contents of the PostgreSQL data directory (simulates an HDD failure
# for the SST/forced-clone integration test). The directory itself is kept so
# the service can re-clone into it.

set -Eeuo pipefail

# Use find instead of `rm -rf …/pgdata/*`:
#  - the shell glob `*` does not match dotfiles, leaving hidden entries behind;
#  - an unmatched glob on an already-empty directory makes rm exit nonzero,
#    which (under `set -e`) would fail the caller's return-code assertion.
# `-mindepth 1` deletes every entry (including hidden ones) but preserves the
# pgdata directory itself; `-delete` implies depth-first removal.
find /var/lib/postgresql/data/pgdata -mindepth 1 -delete
88 changes: 85 additions & 3 deletions tests/integration/ha_tests/test_self_healing.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,11 @@ async def test_kill_db_process(
# Verify that the database service got restarted and is ready in the old primary.
assert await postgresql_ready(ops_test, primary_name)

# Verify that a new primary gets elected (ie old primary is secondary).
new_primary_name = await get_primary(ops_test, app)
assert new_primary_name != primary_name
# Verify that a new primary gets elected (ie old primary is secondary).
for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
with attempt:
new_primary_name = await get_primary(ops_test, app)
assert new_primary_name != primary_name

# Revert the "master_start_timeout" parameter to avoid fail-over again.
await change_master_start_timeout(ops_test, original_master_start_timeout)
Expand Down Expand Up @@ -168,3 +170,83 @@ async def test_freeze_db_process(
assert await secondary_up_to_date(
ops_test, primary_name, total_expected_writes
), "secondary not up to date with the cluster after restarting."


@pytest.mark.ha_self_healing_tests
@pytest.mark.parametrize("process", [POSTGRESQL_PROCESS])
async def test_sst(
ops_test: OpsTest, process: str, continuous_writes, master_start_timeout
) -> None:
"""The SST test.

A forceful restart instance with deleted data and without transaction logs (forced clone).
"""
app = await app_name(ops_test)
primary_name = await get_primary(ops_test, app)

# Start an application that continuously writes data to the database.
await start_continuous_writes(ops_test, app)

# Change the "master_start_timeout" parameter to speed up the fail-over.
original_master_start_timeout = await get_master_start_timeout(ops_test)
await change_master_start_timeout(ops_test, 0)

# Copy data dir content removal script.
await ops_test.juju(
"scp", "tests/integration/ha_tests/clean-data-dir.sh", f"{primary_name}:/tmp"
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i believe that for this test, we need to remove the data directory while the postgres process is down (we need to extend the systemd restart timeout like mongodb here)

an excerpt of a message from mykola:

for SST test (in MySQL):
1) stop pebble/systemd on one member, remove all files in /var/lib/mysql [data directory] (simulate HDD failure)
2) write data to new primary
3) rotate binlog and remove rotated binlog ON ALL alive members. literally remove data written on step 2 (with such we simulate looong period of downtime)
4) run mysql on member from step 1)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the additional details, Shayan! I'll update the PR to have a similar approach on PostgreSQL.

Copy link
Member Author

@marceloneppel marceloneppel Nov 30, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey @shayancanonical! I updated the code to have the right steps that simulate the needed scenario.

I haven't changed the systemd service restart timeout as after stopping the service it was not being restarted until I request to start it again.

I also added a check to ensure the WAL files (the equivalent of the MySQL binlog) are correctly rotated (a new one is created — in fact, more than one new WAL file is kept due to some settings that enable the old ones to be removed).


# Force a restart of the database process.
await send_signal_to_process(ops_test, primary_name, process, "SIGKILL")

# Data removal runs within a script, so it allows `*` expansion.
return_code, _, _ = await ops_test.juju(
"ssh",
primary_name,
"sudo",
"/tmp/clean-data-dir.sh",
)
assert return_code == 0, "Failed to remove data directory"

async with ops_test.fast_forward():
# Verify new writes are continuing by counting the number of writes before and after a
# 3 minutes wait (this is a little more than the loop wait configuration, that is
# considered to trigger a fail-over after master_start_timeout is changed, and also
# when freezing the DB process it take some more time to trigger the fail-over).
writes = await count_writes(ops_test, primary_name)
for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
with attempt:
more_writes = await count_writes(ops_test, primary_name)
assert more_writes > writes, "writes not continuing to DB"

# Verify that a new primary gets elected (ie old primary is secondary).
for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
with attempt:
new_primary_name = await get_primary(ops_test, app)
assert new_primary_name != primary_name

# Revert the "master_start_timeout" parameter to avoid fail-over again.
await change_master_start_timeout(ops_test, original_master_start_timeout)

# Verify that the database service got restarted and is ready in the old primary.
assert await postgresql_ready(ops_test, primary_name)

# Verify that the old primary is now a replica.
assert is_replica(ops_test, primary_name), "there are more than one primary in the cluster."

# Verify that all units are part of the same cluster.
member_ips = await fetch_cluster_members(ops_test)
ip_addresses = [unit.public_address for unit in ops_test.model.applications[app].units]
assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster."

# Verify that no writes to the database were missed after stopping the writes.
total_expected_writes = await stop_continuous_writes(ops_test)
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
with attempt:
actual_writes = await count_writes(ops_test)
assert total_expected_writes == actual_writes, "writes to the db were missed."

# Verify that old primary is up-to-date.
assert await secondary_up_to_date(
ops_test, primary_name, total_expected_writes
), "secondary not up to date with the cluster after restarting."