SST test #43
Changes from 21 commits
04223a8
ebc4814
84fca0c
d5d0ad8
1e64d31
4688a95
2b76d2e
edcfb0c
51cee09
f05a540
5a97730
072252a
f340ccf
b537e82
6c0db4b
60bc8e2
dc3dd8f
6772702
3348c2b
f73aefc
c539f78
ed987ed
b1c8c2c
8e507a0
6fcdebd
107e24c
50721e1
addf716
deb8d23
d8d58ac
730e9d4
3497cf3
New file: tests/integration/ha_tests/clean-data-dir.sh

@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+
+set -Eeuo pipefail
+rm -rf /var/lib/postgresql/data/pgdata/*
HA self-healing integration test module:

@@ -76,9 +76,11 @@ async def test_kill_db_process(
     # Verify that the database service got restarted and is ready in the old primary.
     assert await postgresql_ready(ops_test, primary_name)

-    # Verify that a new primary gets elected (ie old primary is secondary).
-    new_primary_name = await get_primary(ops_test, app)
-    assert new_primary_name != primary_name
+    # Verify that a new primary gets elected (ie old primary is secondary).
+    for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
+        with attempt:
+            new_primary_name = await get_primary(ops_test, app)
+            assert new_primary_name != primary_name

     # Revert the "master_start_timeout" parameter to avoid fail-over again.
     await change_master_start_timeout(ops_test, original_master_start_timeout)
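The hunk above replaces a one-shot check with a retry loop, since primary election can take a while after the old primary is killed. Below is a minimal standalone sketch of the same tenacity pattern; the function name and the callable parameter (standing in for the test's get_primary helper) are illustrative and not part of the PR.

from typing import Callable

from tenacity import Retrying, stop_after_delay, wait_fixed


def wait_for_new_primary(get_primary: Callable[[], str], old_primary: str) -> str:
    """Keep polling until a unit other than old_primary is reported as primary."""
    new_primary = old_primary
    # Retry the block every 3 seconds for up to 3 minutes; tenacity raises a
    # RetryError if the assertion still fails when the deadline is reached.
    for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
        with attempt:
            new_primary = get_primary()
            assert new_primary != old_primary, "primary has not changed yet"
    return new_primary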
@@ -168,3 +170,83 @@ async def test_freeze_db_process(
     assert await secondary_up_to_date(
         ops_test, primary_name, total_expected_writes
     ), "secondary not up to date with the cluster after restarting."
+
+
+@pytest.mark.ha_self_healing_tests
+@pytest.mark.parametrize("process", [POSTGRESQL_PROCESS])
+async def test_sst(
+    ops_test: OpsTest, process: str, continuous_writes, master_start_timeout
+) -> None:
+    """The SST test.
+
+    A forceful restart instance with deleted data and without transaction logs (forced clone).
+    """
+    app = await app_name(ops_test)
+    primary_name = await get_primary(ops_test, app)
+
+    # Start an application that continuously writes data to the database.
+    await start_continuous_writes(ops_test, app)
+
+    # Change the "master_start_timeout" parameter to speed up the fail-over.
+    original_master_start_timeout = await get_master_start_timeout(ops_test)
+    await change_master_start_timeout(ops_test, 0)
+
+    # Copy data dir content removal script.
+    await ops_test.juju(
+        "scp", "tests/integration/ha_tests/clean-data-dir.sh", f"{primary_name}:/tmp"
+    )
Review comment (shayancanonical): I believe that for this test, we need to remove the data directory while the PostgreSQL process is down (we need to extend the systemd restart timeout, as was done for MongoDB). An excerpt of a message from mykola:

Author: Thanks for the additional details, Shayan! I'll update the PR to take a similar approach on PostgreSQL.

Author: Hey @shayancanonical! I updated the code with the right steps to simulate the needed scenario. I haven't changed the systemd service restart timeout, because after stopping the service it was not restarted until I requested it to start again. I also added a check to ensure that the WAL files (the PostgreSQL equivalent of the MySQL binlog) are correctly rotated, i.e. a new one is created; in fact, more than one new WAL file is kept, due to settings that allow the old ones to be removed.
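The WAL rotation check mentioned in the last reply is not visible in this excerpt of the diff. Purely as a hedged sketch, assuming direct SQL access to the primary via psycopg2 (the helper name and connection handling are illustrative, not the PR's implementation), such a check could read the name of the WAL segment currently being written with the standard pg_walfile_name(pg_current_wal_lsn()) function and assert that it changes after the forced clone:

import psycopg2


def current_wal_file(connection_string: str) -> str:
    """Return the name of the WAL segment the primary is currently writing to."""
    # Illustrative only: record this value before the forced clone and assert it
    # differs afterwards to show that new WAL segments were created.
    connection = psycopg2.connect(connection_string)
    try:
        with connection.cursor() as cursor:
            cursor.execute("SELECT pg_walfile_name(pg_current_wal_lsn());")
            return cursor.fetchone()[0]
    finally:
        connection.close()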
+
+    # Force a restart of the database process.
+    await send_signal_to_process(ops_test, primary_name, process, "SIGKILL")
+
+    # Data removal runs within a script, so it allows `*` expansion.
+    return_code, _, _ = await ops_test.juju(
+        "ssh",
+        primary_name,
+        "sudo",
+        "/tmp/clean-data-dir.sh",
+    )
+    assert return_code == 0, "Failed to remove data directory"
+
+    async with ops_test.fast_forward():
+        # Verify new writes are continuing by counting the number of writes before and after a
+        # 3 minutes wait (this is a little more than the loop wait configuration, that is
+        # considered to trigger a fail-over after master_start_timeout is changed, and also
+        # when freezing the DB process it take some more time to trigger the fail-over).
+        writes = await count_writes(ops_test, primary_name)
+        for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
+            with attempt:
+                more_writes = await count_writes(ops_test, primary_name)
+                assert more_writes > writes, "writes not continuing to DB"
+
+        # Verify that a new primary gets elected (ie old primary is secondary).
+        for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)):
+            with attempt:
+                new_primary_name = await get_primary(ops_test, app)
+                assert new_primary_name != primary_name
+
+        # Revert the "master_start_timeout" parameter to avoid fail-over again.
+        await change_master_start_timeout(ops_test, original_master_start_timeout)
+
+        # Verify that the database service got restarted and is ready in the old primary.
+        assert await postgresql_ready(ops_test, primary_name)
+
+    # Verify that the old primary is now a replica.
+    assert is_replica(ops_test, primary_name), "there are more than one primary in the cluster."
+
+    # Verify that all units are part of the same cluster.
+    member_ips = await fetch_cluster_members(ops_test)
+    ip_addresses = [unit.public_address for unit in ops_test.model.applications[app].units]
+    assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster."
+
+    # Verify that no writes to the database were missed after stopping the writes.
+    total_expected_writes = await stop_continuous_writes(ops_test)
+    for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
+        with attempt:
+            actual_writes = await count_writes(ops_test)
+            assert total_expected_writes == actual_writes, "writes to the db were missed."
+
+    # Verify that old primary is up-to-date.
+    assert await secondary_up_to_date(
+        ops_test, primary_name, total_expected_writes
+    ), "secondary not up to date with the cluster after restarting."