From fc891d5dbd471876dff99ae9aeeab7c36ae7a659 Mon Sep 17 00:00:00 2001 From: Mia Altieri <32723809+MiaAltieri@users.noreply.github.com> Date: Thu, 2 Nov 2023 09:17:07 +0100 Subject: [PATCH] [DPE-2778] Shard removal tests (#283) ## Issue No tests for shard removal ## Solution Add tests for various removal scenarios ## Extras 1. These tests highlighted an issue with slow drainage. A large TIMEOUT period has been added to the tests, and an issue (#282) has been added to the repo. 2. Wait for relation_changed to receive both the keyfile and the operator password, as these are not always provided by the config server on relation-joined (a minimal sketch of this guard follows the diff). [discussed here](https://chat.charmhub.io/charmhub/pl/prmxufjyfbnt9qrzdrxoxkqz3e) --- .github/workflows/ci.yaml | 2 +- lib/charms/mongodb/v1/mongos.py | 18 +- lib/charms/mongodb/v1/shards_interface.py | 23 +- tests/integration/sharding_tests/helpers.py | 36 +++ .../sharding_tests/test_sharding.py | 243 ++++++++++++++++-- 5 files changed, 287 insertions(+), 35 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3c0bdab19..9e050e44c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -67,7 +67,7 @@ jobs: build: name: Build charms - uses: canonical/data-platform-workflows/.github/workflows/build_charms_with_cache.yaml@v2 + uses: canonical/data-platform-workflows/.github/workflows/build_charms_with_cache.yaml@v5.1.2 integration-test: strategy: diff --git a/lib/charms/mongodb/v1/mongos.py b/lib/charms/mongodb/v1/mongos.py index 774b3da14..a8d5455d6 100644 --- a/lib/charms/mongodb/v1/mongos.py +++ b/lib/charms/mongodb/v1/mongos.py @@ -235,10 +235,14 @@ def remove_shard(self, shard_name: str) -> None: ) self._move_primary(databases_using_shard_as_primary, old_primary=shard_name) - # MongoDB docs says to re-run removeShard after running movePrimary - logger.info("removing shard: %s, after moving primary", shard_name) - removal_info = self.client.admin.command("removeShard", shard_name) - self._log_removal_info(removal_info, shard_name) + # MongoDB docs say to re-run removeShard after running movePrimary + logger.info("removing shard: %s, after moving primary", shard_name) + removal_info = self.client.admin.command("removeShard", shard_name) + self._log_removal_info(removal_info, shard_name) + + if shard_name in self.get_shard_members(): + logger.info("Shard %s is still present in sharded cluster.", shard_name) + raise NotDrainedError() def _is_shard_draining(self, shard_name: str) -> bool: """Reports if a given shard is currently in the draining state. @@ -361,6 +365,12 @@ def is_shard_aware(self, shard_name: str) -> bool: def _retrieve_remaining_chunks(self, removal_info) -> int: """Parses the remaining chunks to remove from removeShard command.""" + # when chunks have finished draining, "remaining" is still in the removal info, but + # marked as 0.
If "remaining" is not present in removal_info, then the shard is not yet + # draining + if "remaining" not in removal_info: + raise NotDrainedError() + return removal_info["remaining"]["chunks"] if "remaining" in removal_info else 0 def _move_primary(self, databases_to_move: List[str], old_primary: str) -> None: diff --git a/lib/charms/mongodb/v1/shards_interface.py b/lib/charms/mongodb/v1/shards_interface.py index ff5526e68..fe19bfa2b 100644 --- a/lib/charms/mongodb/v1/shards_interface.py +++ b/lib/charms/mongodb/v1/shards_interface.py @@ -52,7 +52,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 2 +LIBPATCH = 3 KEYFILE_KEY = "key-file" HOSTS_KEY = "host" OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username()) @@ -241,6 +241,7 @@ def remove_shards(self, departed_shard_id): raises: PyMongoError, NotReadyError """ + retry_removal = False with MongosConnection(self.charm.mongos_config) as mongo: cluster_shards = mongo.get_shard_members() relation_shards = self._get_shards_from_relations(departed_shard_id) @@ -250,11 +251,19 @@ def remove_shards(self, departed_shard_id): self.charm.unit.status = MaintenanceStatus(f"Draining shard {shard}") logger.info("Attempting to removing shard: %s", shard) mongo.remove_shard(shard) + except NotReadyError: + logger.info("Unable to remove shard: %s, another shard is draining", shard) + # to guarantee that the currently draining shard gets re-processed, do not raise + # immediately; instead raise at the end of removal processing. + retry_removal = True except ShardNotInClusterError: logger.info( "Shard to remove is not in sharded cluster. It has been successfully removed." ) + if retry_removal: + raise ShardNotInClusterError + def update_credentials(self, key: str, value: str) -> None: """Sends new credentials, for a key value pair across all shards.""" for relation in self.charm.model.relations[self.relation_name]: @@ -432,6 +441,11 @@ def _on_relation_changed(self, event): # shards rely on the config server for secrets relation_data = event.relation.data[event.app] + if not relation_data.get(KEYFILE_KEY): + event.defer() + self.charm.unit.status = WaitingStatus("Waiting for secrets from config-server") + return + self.update_keyfile(key_file_contents=relation_data.get(KEYFILE_KEY)) # restart on high loaded databases can be very slow (e.g. up to 10-20 minutes). @@ -446,6 +460,10 @@ def _on_relation_changed(self, event): return # TODO Future work, see if needed to check for all units restarted / primary elected + if not relation_data.get(OPERATOR_PASSWORD_KEY): + event.defer() + self.charm.unit.status = WaitingStatus("Waiting for secrets from config-server") + return try: self.update_operator_password(new_password=relation_data.get(OPERATOR_PASSWORD_KEY)) @@ -516,12 +534,13 @@ def wait_for_draining(self, mongos_hosts: List[str]): while not drained: try: # no need to continuously check and abuse resources while shard is draining - time.sleep(10) + time.sleep(60) drained = self.drained(mongos_hosts, self.charm.app.name) self.charm.unit.status = MaintenanceStatus("Draining shard from cluster") draining_status = ( "Shard is still draining" if not drained else "Shard is fully drained."
) + self.charm.unit.status = MaintenanceStatus("Draining shard from cluster") logger.debug(draining_status) except PyMongoError as e: logger.error("Error occurred while draining shard: %s", e) diff --git a/tests/integration/sharding_tests/helpers.py b/tests/integration/sharding_tests/helpers.py index 868d45b1e..d8f9f8ae5 100644 --- a/tests/integration/sharding_tests/helpers.py +++ b/tests/integration/sharding_tests/helpers.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # Copyright 2023 Canonical Ltd. # See LICENSE file for licensing details. +from typing import List, Optional from urllib.parse import quote_plus from pymongo import MongoClient @@ -43,3 +44,38 @@ def verify_data_mongodb(client, db_name, coll_name, key, value) -> bool: test_collection = db[coll_name] query = test_collection.find({}, {key: 1}) return query[0][key] == value + + +def get_cluster_shards(mongos_client) -> set: + """Returns a set of the shard members.""" + shard_list = mongos_client.admin.command("listShards") + curr_members = [member["host"].split("/")[0] for member in shard_list["shards"]] + return set(curr_members) + + +def get_databases_for_shard(mongos_client, shard_name) -> Optional[List[str]]: + """Returns the databases hosted on the given shard.""" + config_db = mongos_client["config"] + if "databases" not in config_db.list_collection_names(): + return None + + databases_collection = config_db["databases"] + + if databases_collection is None: + return + + return databases_collection.distinct("_id", {"primary": shard_name}) + + +def has_correct_shards(mongos_client, expected_shards: List[str]) -> bool: + """Returns true if the cluster config has the expected shards.""" + shard_names = get_cluster_shards(mongos_client) + return shard_names == set(expected_shards) + + +def shard_has_databases( + mongos_client, shard_name: str, expected_databases_on_shard: List[str] +) -> bool: + """Returns true if the provided shard is a primary for the provided databases.""" + databases_on_shard = get_databases_for_shard(mongos_client, shard_name=shard_name) + return set(databases_on_shard) == set(expected_databases_on_shard) diff --git a/tests/integration/sharding_tests/test_sharding.py b/tests/integration/sharding_tests/test_sharding.py index 9b3c44069..562a3ec01 100644 --- a/tests/integration/sharding_tests/test_sharding.py +++ b/tests/integration/sharding_tests/test_sharding.py @@ -6,10 +6,17 @@ import pytest from pytest_operator.plugin import OpsTest -from .helpers import generate_mongodb_client, verify_data_mongodb, write_data_to_mongodb +from .helpers import ( + generate_mongodb_client, + has_correct_shards, + shard_has_databases, + verify_data_mongodb, + write_data_to_mongodb, +) SHARD_ONE_APP_NAME = "shard-one" SHARD_TWO_APP_NAME = "shard-two" +SHARD_THREE_APP_NAME = "shard-three" SHARD_APPS = [SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME] CONFIG_SERVER_APP_NAME = "config-server-one" SHARD_REL_NAME = "sharding" @@ -37,14 +44,17 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: await ops_test.model.deploy( my_charm, num_units=2, config={"role": "shard"}, application_name=SHARD_TWO_APP_NAME ) + await ops_test.model.deploy( + my_charm, num_units=2, config={"role": "shard"}, application_name=SHARD_THREE_APP_NAME + ) - async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle( - apps=[CONFIG_SERVER_APP_NAME, SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME], - idle_period=20, - raise_on_blocked=False, - timeout=TIMEOUT, - ) + await ops_test.model.wait_for_idle( + apps=[CONFIG_SERVER_APP_NAME, SHARD_ONE_APP_NAME, 
SHARD_THREE_APP_NAME], + idle_period=20, + raise_on_blocked=False, + timeout=TIMEOUT, + raise_on_error=False, + ) # verify that Charmed MongoDB is blocked and reports incorrect credentials await asyncio.gather( @@ -81,16 +91,36 @@ async def test_cluster_active(ops_test: OpsTest) -> None: f"{SHARD_TWO_APP_NAME}:{SHARD_REL_NAME}", f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}", ) + await ops_test.model.integrate( + f"{SHARD_THREE_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) - async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle( - apps=[CONFIG_SERVER_APP_NAME, SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME], - idle_period=20, - status="active", - timeout=TIMEOUT, - ) + await ops_test.model.wait_for_idle( + apps=[ + CONFIG_SERVER_APP_NAME, + SHARD_ONE_APP_NAME, + SHARD_TWO_APP_NAME, + SHARD_THREE_APP_NAME, + ], + idle_period=20, + status="active", + timeout=TIMEOUT, + raise_on_error=False, + ) + + mongos_client = await generate_mongodb_client( + ops_test, app_name=CONFIG_SERVER_APP_NAME, mongos=True + ) + + # verify sharded cluster config + assert has_correct_shards( + mongos_client, + expected_shards=[SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME, SHARD_THREE_APP_NAME], + ), "Config server did not process config properly" +@pytest.mark.abort_on_fail async def test_sharding(ops_test: OpsTest) -> None: """Tests writing data to mongos gets propagated to shards.""" # write data to mongos on both shards. @@ -98,46 +128,203 @@ async def test_sharding(ops_test: OpsTest) -> None: ops_test, app_name=CONFIG_SERVER_APP_NAME, mongos=True ) - # write data to shard one + # write data to shard two write_data_to_mongodb( mongos_client, db_name="animals_database_1", coll_name="horses", content={"horse-breed": "unicorn", "real": True}, ) - mongos_client.admin.command("movePrimary", "animals_database_1", to=SHARD_ONE_APP_NAME) + mongos_client.admin.command("movePrimary", "animals_database_1", to=SHARD_TWO_APP_NAME) - # write data to shard two + # write data to shard three write_data_to_mongodb( mongos_client, db_name="animals_database_2", coll_name="horses", content={"horse-breed": "pegasus", "real": True}, ) - mongos_client.admin.command("movePrimary", "animals_database_2", to=SHARD_TWO_APP_NAME) + mongos_client.admin.command("movePrimary", "animals_database_2", to=SHARD_THREE_APP_NAME) - # log into shard 1 verify data - shard_one_client = await generate_mongodb_client( - ops_test, app_name=SHARD_ONE_APP_NAME, mongos=False + # log into shard two verify data + shard_two_client = await generate_mongodb_client( + ops_test, app_name=SHARD_TWO_APP_NAME, mongos=False ) has_correct_data = verify_data_mongodb( - shard_one_client, + shard_two_client, db_name="animals_database_1", coll_name="horses", key="horse-breed", value="unicorn", ) - assert has_correct_data, "data not written to shard-one" + assert has_correct_data, "data not written to shard-two" # log into shard 2 verify data - shard_two_client = await generate_mongodb_client( - ops_test, app_name=SHARD_TWO_APP_NAME, mongos=False + shard_three_client = await generate_mongodb_client( + ops_test, app_name=SHARD_THREE_APP_NAME, mongos=False ) has_correct_data = verify_data_mongodb( - shard_two_client, + shard_three_client, db_name="animals_database_2", coll_name="horses", key="horse-breed", value="pegasus", ) - assert has_correct_data, "data not written to shard-two" + assert has_correct_data, "data not written to shard-three" + + +async def test_shard_removal(ops_test: OpsTest) -> None: + """Test shard 
removal. + + This test also verifies that: + - Databases that are using this shard as a primary are moved. + - The balancer is turned back on if turned off. + - Config server supports removing multiple shards. + """ + # turn off balancer. + mongos_client = await generate_mongodb_client( + ops_test, app_name=CONFIG_SERVER_APP_NAME, mongos=True + ) + mongos_client.admin.command("balancerStop") + balancer_state = mongos_client.admin.command("balancerStatus") + assert balancer_state["mode"] == "off", "balancer was not successfully turned off" + + # remove two shards at the same time + await ops_test.model.applications[CONFIG_SERVER_APP_NAME].remove_relation( + f"{SHARD_TWO_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) + await ops_test.model.applications[CONFIG_SERVER_APP_NAME].remove_relation( + f"{SHARD_THREE_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) + + await ops_test.model.wait_for_idle( + apps=[ + CONFIG_SERVER_APP_NAME, + SHARD_ONE_APP_NAME, + SHARD_TWO_APP_NAME, + SHARD_THREE_APP_NAME, + ], + idle_period=20, + status="active", + timeout=TIMEOUT, + raise_on_error=False, + ) + + # TODO future PR: assert statuses are correct + + # verify that config server turned back on the balancer + balancer_state = mongos_client.admin.command("balancerStatus") + assert balancer_state["mode"] != "off", "balancer not turned back on from config server" + + # verify sharded cluster config + assert has_correct_shards( + mongos_client, expected_shards=[SHARD_ONE_APP_NAME] + ), "Config server did not process config properly" + + # verify no data lost + assert shard_has_databases( + mongos_client, + shard_name=SHARD_ONE_APP_NAME, + expected_databases_on_shard=["animals_database_1", "animals_database_2"], + ), "Not all databases on final shard" + + +async def test_removal_of_non_primary_shard(ops_test: OpsTest): + """Tests safe removal of a shard that is not a primary.""" + # add back a shard so we can safely remove a shard. + await ops_test.model.integrate( + f"{SHARD_TWO_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}", ) + + await ops_test.model.wait_for_idle( + apps=[ + CONFIG_SERVER_APP_NAME, + SHARD_ONE_APP_NAME, + SHARD_TWO_APP_NAME, + SHARD_THREE_APP_NAME, + ], + idle_period=20, + status="active", + timeout=TIMEOUT, + raise_on_error=False, + ) + + await ops_test.model.applications[CONFIG_SERVER_APP_NAME].remove_relation( + f"{SHARD_TWO_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) + + await ops_test.model.wait_for_idle( + apps=[CONFIG_SERVER_APP_NAME, SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME], + idle_period=20, + status="active", + timeout=TIMEOUT, + raise_on_error=False, + ) + + mongos_client = await generate_mongodb_client( + ops_test, app_name=CONFIG_SERVER_APP_NAME, mongos=True ) + + # verify sharded cluster config + assert has_correct_shards( + mongos_client, expected_shards=[SHARD_ONE_APP_NAME] + ), "Config server did not process config properly" + + # verify no data lost + assert shard_has_databases( + mongos_client, + shard_name=SHARD_ONE_APP_NAME, + expected_databases_on_shard=["animals_database_1", "animals_database_2"], + ), "Not all databases on final shard" + + +async def test_unconventual_shard_removal(ops_test: OpsTest): + """Tests that removing a shard application safely drains data. + + It is preferred that users remove relations instead of removing shard applications.
But we do + support removing shard applications in a safe way. + """ + # add back a shard so we can safely remove a shard. + await ops_test.model.integrate( + f"{SHARD_TWO_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) + + await ops_test.model.applications[SHARD_TWO_APP_NAME].destroy_units(f"{SHARD_TWO_APP_NAME}/0") + await ops_test.model.wait_for_idle( + apps=[SHARD_TWO_APP_NAME], + idle_period=20, + status="active", + timeout=TIMEOUT, + raise_on_error=False, + ) + + await ops_test.model.remove_application(SHARD_TWO_APP_NAME, block_until_done=True) + + await ops_test.model.wait_for_idle( + apps=[CONFIG_SERVER_APP_NAME, SHARD_ONE_APP_NAME], + idle_period=20, + status="active", + timeout=TIMEOUT, + raise_on_error=False, + ) + + mongos_client = await generate_mongodb_client( + ops_test, app_name=CONFIG_SERVER_APP_NAME, mongos=True + ) + + # verify sharded cluster config + assert has_correct_shards( + mongos_client, expected_shards=[SHARD_ONE_APP_NAME] + ), "Config server did not process config properly" + + # verify no data lost + assert shard_has_databases( + mongos_client, + shard_name=SHARD_ONE_APP_NAME, + expected_databases_on_shard=["animals_database_1", "animals_database_2"], + ), "Not all databases on final shard"
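For reference, here is a minimal sketch of the relation-changed guard described in Extras item 2, assuming a simplified stand-in charm. The real handler lives in `lib/charms/mongodb/v1/shards_interface.py`, and the real `OPERATOR_PASSWORD_KEY` is derived via `MongoDBUser`, so the literal value and class name below are assumptions for illustration only.

```python
# Illustrative sketch (not part of this patch): defer relation-changed handling
# until the config server has shared both secrets.
from ops.charm import CharmBase, RelationChangedEvent
from ops.model import WaitingStatus

KEYFILE_KEY = "key-file"
OPERATOR_PASSWORD_KEY = "operator-password"  # assumed literal; the charm derives this via MongoDBUser


class ShardSketchCharm(CharmBase):
    """Minimal stand-in for the shard side of the sharding relation."""

    def __init__(self, *args):
        super().__init__(*args)
        self.framework.observe(
            self.on["sharding"].relation_changed, self._on_relation_changed
        )

    def _on_relation_changed(self, event: RelationChangedEvent) -> None:
        relation_data = event.relation.data[event.app]

        # The keyfile and operator password may arrive after relation-joined,
        # so wait (defer) until both are present instead of failing part-way.
        if not relation_data.get(KEYFILE_KEY) or not relation_data.get(OPERATOR_PASSWORD_KEY):
            self.unit.status = WaitingStatus("Waiting for secrets from config-server")
            event.defer()
            return

        # Both secrets are available; update the keyfile and operator password here.
```

Deferring keeps the handler idempotent: the deferred event is re-run on a later hook dispatch, by which point the config server has normally published both values.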