Skip to content
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
04223a8
Add SST test
marceloneppel Oct 17, 2022
ebc4814
Enable previous tests
marceloneppel Oct 17, 2022
84fca0c
fix early tls deployment by only reloading patroni config if it's alr…
Oct 18, 2022
d5d0ad8
Improve code
marceloneppel Oct 18, 2022
1e64d31
Remove duplicate check
marceloneppel Oct 18, 2022
4688a95
Remove unused import
marceloneppel Oct 18, 2022
2b76d2e
added unit test for reloading patroni
Oct 18, 2022
edcfb0c
lint
Oct 18, 2022
51cee09
removing postgres restart check
Oct 18, 2022
f05a540
Pin Juju agent version on CI
marceloneppel Oct 18, 2022
5a97730
adding series flags to test apps
Oct 18, 2022
072252a
adding series flags to test apps
Oct 18, 2022
f340ccf
made series into a list
Oct 18, 2022
b537e82
Update test_new_relations.py
WRFitch Oct 18, 2022
6c0db4b
Add retrying
marceloneppel Oct 18, 2022
60bc8e2
updating test to better emulate bundle deployment
Oct 18, 2022
dc3dd8f
Merge branch 'fix-early-tls-deployment' of https://github.com/canonic…
Oct 18, 2022
6772702
Merge remote-tracking branch 'origin/fix-early-tls-deployment' into s…
marceloneppel Oct 19, 2022
3348c2b
Merge branch 'main' into sst-test
marceloneppel Nov 16, 2022
f73aefc
Remove unused code
marceloneppel Nov 16, 2022
c539f78
Change processes list
marceloneppel Nov 16, 2022
ed987ed
Add logic for ensuring all units down
marceloneppel Nov 18, 2022
b1c8c2c
Change delay to only one unit
marceloneppel Nov 18, 2022
8e507a0
Add WAL switch
marceloneppel Nov 18, 2022
6fcdebd
Merge branch 'main' into sst-test
marceloneppel Nov 28, 2022
107e24c
Updates related to WAL removal
marceloneppel Nov 28, 2022
50721e1
Small improvements
marceloneppel Nov 28, 2022
addf716
Add comments
marceloneppel Nov 28, 2022
deb8d23
Change the way service is stopped
marceloneppel Nov 29, 2022
d8d58ac
Remove slot removal
marceloneppel Nov 29, 2022
730e9d4
Small fixes
marceloneppel Nov 29, 2022
3497cf3
Remove unused parameter
marceloneppel Nov 29, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions tests/integration/ha_tests/clean-data-dir.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
# Wipe the contents of the PostgreSQL data directory (simulates an HDD failure
# for the SST/forced-clone integration test). The directory itself is kept so
# the service can re-clone into it.

set -Eeuo pipefail

# Use find instead of `rm -rf …/pgdata/*`:
#  - the shell glob `*` does not match dotfiles, leaving hidden entries behind;
#  - an unmatched glob on an already-empty directory makes rm exit nonzero,
#    which (under `set -e`) would fail the caller's return-code assertion.
# `-mindepth 1` deletes every entry (including hidden ones) but preserves the
# pgdata directory itself; `-delete` implies depth-first removal.
find /var/lib/postgresql/data/pgdata -mindepth 1 -delete
88 changes: 85 additions & 3 deletions tests/integration/ha_tests/test_self_healing.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,11 @@ async def test_kill_db_process(
# Verify that the database service got restarted and is ready in the old primary.
assert await postgresql_ready(ops_test, primary_name)

# Verify that a new primary gets elected (ie old primary is secondary).
new_primary_name = await get_primary(ops_test, app)
assert new_primary_name != primary_name
# Verify that a new primary gets elected (ie old primary is secondary).
for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
with attempt:
new_primary_name = await get_primary(ops_test, app)
assert new_primary_name != primary_name

# Revert the "master_start_timeout" parameter to avoid fail-over again.
await change_master_start_timeout(ops_test, original_master_start_timeout)
Expand Down Expand Up @@ -168,3 +170,83 @@ async def test_freeze_db_process(
assert await secondary_up_to_date(
ops_test, primary_name, total_expected_writes
), "secondary not up to date with the cluster after restarting."


@pytest.mark.ha_self_healing_tests
@pytest.mark.parametrize("process", [POSTGRESQL_PROCESS])
async def test_sst(
ops_test: OpsTest, process: str, continuous_writes, master_start_timeout
) -> None:
"""The SST test.

A forceful restart instance with deleted data and without transaction logs (forced clone).
"""
app = await app_name(ops_test)
primary_name = await get_primary(ops_test, app)

# Start an application that continuously writes data to the database.
await start_continuous_writes(ops_test, app)

# Change the "master_start_timeout" parameter to speed up the fail-over.
original_master_start_timeout = await get_master_start_timeout(ops_test)
await change_master_start_timeout(ops_test, 0)

# Copy data dir content removal script.
await ops_test.juju(
"scp", "tests/integration/ha_tests/clean-data-dir.sh", f"{primary_name}:/tmp"
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i believe that for this test, we need to remove the data directory while the postgres process is down (we need to extend the systemd restart timeout like mongodb here)

an excerpt of a message from mykola:

for SST test (in MySQL):
1) stop pebble/systemd on one member, remove all files in /var/lib/mysql [data directory] (simulate HDD failure)
2) write data to new primary
3) rotate binlog and remove rotated binlog ON ALL alive members. literally remove data written on step 2 (with such we simulate looong period of downtime)
4) run mysql on member from step 1)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the additional details, Shayan! I'll update the PR to have a similar approach on PostgreSQL.

Copy link
Member Author

@marceloneppel marceloneppel Nov 30, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey @shayancanonical! I updated the code to have the right steps that simulate the needed scenario.

I haven't changed the systemd service restart timeout as after stopping the service it was not being restarted until I request to start it again.

I also added a check to ensure the WAL files (the equivalent of the MySQL binlog) are correctly rotated (a new one is created — in fact, more than one new WAL file is kept due to some settings that enable the old ones to be removed).


# Force a restart of the database process.
await send_signal_to_process(ops_test, primary_name, process, "SIGKILL")

# Data removal runs within a script, so it allows `*` expansion.
return_code, _, _ = await ops_test.juju(
"ssh",
primary_name,
"sudo",
"/tmp/clean-data-dir.sh",
)
assert return_code == 0, "Failed to remove data directory"

async with ops_test.fast_forward():
# Verify new writes are continuing by counting the number of writes before and after a
# 3 minutes wait (this is a little more than the loop wait configuration, that is
# considered to trigger a fail-over after master_start_timeout is changed, and also
# when freezing the DB process it take some more time to trigger the fail-over).
writes = await count_writes(ops_test, primary_name)
for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
with attempt:
more_writes = await count_writes(ops_test, primary_name)
assert more_writes > writes, "writes not continuing to DB"

# Verify that a new primary gets elected (ie old primary is secondary).
for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
with attempt:
new_primary_name = await get_primary(ops_test, app)
assert new_primary_name != primary_name

# Revert the "master_start_timeout" parameter to avoid fail-over again.
await change_master_start_timeout(ops_test, original_master_start_timeout)

# Verify that the database service got restarted and is ready in the old primary.
assert await postgresql_ready(ops_test, primary_name)

# Verify that the old primary is now a replica.
assert is_replica(ops_test, primary_name), "there are more than one primary in the cluster."

# Verify that all units are part of the same cluster.
member_ips = await fetch_cluster_members(ops_test)
ip_addresses = [unit.public_address for unit in ops_test.model.applications[app].units]
assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster."

# Verify that no writes to the database were missed after stopping the writes.
total_expected_writes = await stop_continuous_writes(ops_test)
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
with attempt:
actual_writes = await count_writes(ops_test)
assert total_expected_writes == actual_writes, "writes to the db were missed."

# Verify that old primary is up-to-date.
assert await secondary_up_to_date(
ops_test, primary_name, total_expected_writes
), "secondary not up to date with the cluster after restarting."