-
Notifications
You must be signed in to change notification settings - Fork 27
Freeze DB process test #39
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
13 commits
Select commit
Hold shift + click to select a range
79daddd
Add alternative servers for primary and members retrieval
marceloneppel 7811a0b
Test working
marceloneppel a4b76c8
Test working
marceloneppel ba63682
Cleanup the code
marceloneppel b48a3bd
More cleanup
marceloneppel 800ad8f
Small adjustments
marceloneppel 5d679df
Merge branch 'main' into freeze-db-process-test
marceloneppel 9ab9bf3
Add unit tests
marceloneppel 1c0d12d
Improve comments
marceloneppel ec6705d
Use down unit
marceloneppel 2108c4a
Improve alternative URL description
marceloneppel 6a1e39b
Add additional checks
marceloneppel 29eae5b
Improve returns
marceloneppel File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,11 +9,15 @@ | |
| from tests.integration.ha_tests.helpers import ( | ||
| METADATA, | ||
| app_name, | ||
| change_master_start_timeout, | ||
| count_writes, | ||
| fetch_cluster_members, | ||
| get_master_start_timeout, | ||
| get_primary, | ||
| kill_process, | ||
| is_replica, | ||
| postgresql_ready, | ||
| secondary_up_to_date, | ||
| send_signal_to_process, | ||
| start_continuous_writes, | ||
| stop_continuous_writes, | ||
| ) | ||
|
|
@@ -52,8 +56,12 @@ async def test_kill_db_process( | |
| # Start an application that continuously writes data to the database. | ||
| await start_continuous_writes(ops_test, app) | ||
|
|
||
| # Change the "master_start_timeout" parameter to speed up the fail-over. | ||
| original_master_start_timeout = await get_master_start_timeout(ops_test) | ||
| await change_master_start_timeout(ops_test, 0) | ||
|
|
||
| # Kill the database process. | ||
| await kill_process(ops_test, primary_name, process, kill_code="SIGKILL") | ||
| await send_signal_to_process(ops_test, primary_name, process, kill_code="SIGKILL") | ||
|
|
||
| async with ops_test.fast_forward(): | ||
| # Verify new writes are continuing by counting the number of writes before and after a | ||
|
|
@@ -72,6 +80,83 @@ async def test_kill_db_process( | |
| new_primary_name = await get_primary(ops_test, app) | ||
| assert new_primary_name != primary_name | ||
|
|
||
| # Revert the "master_start_timeout" parameter to avoid fail-over again. | ||
| await change_master_start_timeout(ops_test, original_master_start_timeout) | ||
|
|
||
| # Verify that the old primary is now a replica. | ||
| assert is_replica(ops_test, primary_name), "there are more than one primary in the cluster." | ||
|
|
||
| # Verify that all units are part of the same cluster. | ||
| member_ips = await fetch_cluster_members(ops_test) | ||
| ip_addresses = [unit.public_address for unit in ops_test.model.applications[app].units] | ||
| assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster." | ||
|
|
||
| # Verify that no writes to the database were missed after stopping the writes. | ||
| total_expected_writes = await stop_continuous_writes(ops_test) | ||
| for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): | ||
| with attempt: | ||
| actual_writes = await count_writes(ops_test) | ||
| assert total_expected_writes == actual_writes, "writes to the db were missed." | ||
|
|
||
| # Verify that the old primary is up-to-date. | ||
| assert await secondary_up_to_date( | ||
| ops_test, primary_name, total_expected_writes | ||
| ), "secondary not up to date with the cluster after restarting." | ||
|
|
||
|
|
||
| @pytest.mark.ha_self_healing_tests | ||
| @pytest.mark.parametrize("process", DB_PROCESSES) | ||
| async def test_freeze_db_process( | ||
| ops_test: OpsTest, process: str, continuous_writes, master_start_timeout | ||
| ) -> None: | ||
| # Locate primary unit. | ||
| app = await app_name(ops_test) | ||
| primary_name = await get_primary(ops_test, app) | ||
|
|
||
| # Start an application that continuously writes data to the database. | ||
| await start_continuous_writes(ops_test, app) | ||
|
|
||
| # Change the "master_start_timeout" parameter to speed up the fail-over. | ||
| original_master_start_timeout = await get_master_start_timeout(ops_test) | ||
| await change_master_start_timeout(ops_test, 0) | ||
|
|
||
| # Freeze the database process. | ||
| await send_signal_to_process(ops_test, primary_name, process, "SIGSTOP") | ||
|
|
||
| async with ops_test.fast_forward(): | ||
| # Verify new writes are continuing by counting the number of writes before and after a | ||
| # 3-minute wait (this is a little more than the loop wait configuration, which is | ||
| # taken into account to trigger a fail-over after master_start_timeout is changed; also, | ||
| # when freezing the DB process, it takes some more time to trigger the fail-over). | ||
| writes = await count_writes(ops_test, primary_name) | ||
| for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)): | ||
| with attempt: | ||
| more_writes = await count_writes(ops_test, primary_name) | ||
| assert more_writes > writes, "writes not continuing to DB" | ||
|
|
||
| # Verify that a new primary gets elected (i.e. the old primary is now a secondary). | ||
| for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(3)): | ||
| with attempt: | ||
| new_primary_name = await get_primary(ops_test, app) | ||
| assert new_primary_name != primary_name | ||
|
|
||
| # Revert the "master_start_timeout" parameter to avoid fail-over again. | ||
| await change_master_start_timeout(ops_test, original_master_start_timeout) | ||
|
|
||
| # Un-freeze the old primary. | ||
| await send_signal_to_process(ops_test, primary_name, process, "SIGCONT") | ||
|
|
||
| # Verify that the database service got restarted and is ready in the old primary. | ||
| assert await postgresql_ready(ops_test, primary_name) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add checks that verify:
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Great suggestion. Added both in 6a1e39b. I also added those checks to the kill DB process test. |
||
|
|
||
| # Verify that the old primary is now a replica. | ||
| assert is_replica(ops_test, primary_name), "there are more than one primary in the cluster." | ||
|
|
||
| # Verify that all units are part of the same cluster. | ||
| member_ips = await fetch_cluster_members(ops_test) | ||
| ip_addresses = [unit.public_address for unit in ops_test.model.applications[app].units] | ||
| assert set(member_ips) == set(ip_addresses), "not all units are part of the same cluster." | ||
|
|
||
| # Verify that no writes to the database were missed after stopping the writes. | ||
| total_expected_writes = await stop_continuous_writes(ops_test) | ||
| for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
great function 🤩