From fc891d5dbd471876dff99ae9aeeab7c36ae7a659 Mon Sep 17 00:00:00 2001 From: Mia Altieri <32723809+MiaAltieri@users.noreply.github.com> Date: Thu, 2 Nov 2023 09:17:07 +0100 Subject: [PATCH] [DPE-2778] Shard removal tests (#283) ## Issue No tests for shard removal ## Solution Add tests for various removal scenarios ## Extras 1. These tests highlighted an issue with slow drainage. A large TIMEOUT period has been added to the tests, and an issue (#282) has been added to the repo. 2. Wait for relation_changed to receive both the keyfile and the operator password, as these are not always provided by the config server on relation-joined (a minimal sketch of this guard follows the diff). [discussed here](https://chat.charmhub.io/charmhub/pl/prmxufjyfbnt9qrzdrxoxkqz3e) --- .github/workflows/ci.yaml | 2 +- lib/charms/mongodb/v1/mongos.py | 18 +- lib/charms/mongodb/v1/shards_interface.py | 23 +- tests/integration/sharding_tests/helpers.py | 36 +++ .../sharding_tests/test_sharding.py | 243 ++++++++++++++++-- 5 files changed, 287 insertions(+), 35 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3c0bdab19..9e050e44c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -67,7 +67,7 @@ jobs: build: name: Build charms - uses: canonical/data-platform-workflows/.github/workflows/build_charms_with_cache.yaml@v2 + uses: canonical/data-platform-workflows/.github/workflows/build_charms_with_cache.yaml@v5.1.2 integration-test: strategy: diff --git a/lib/charms/mongodb/v1/mongos.py b/lib/charms/mongodb/v1/mongos.py index 774b3da14..a8d5455d6 100644 --- a/lib/charms/mongodb/v1/mongos.py +++ b/lib/charms/mongodb/v1/mongos.py @@ -235,10 +235,14 @@ def remove_shard(self, shard_name: str) -> None: ) self._move_primary(databases_using_shard_as_primary, old_primary=shard_name) - # MongoDB docs says to re-run removeShard after running movePrimary - logger.info("removing shard: %s, after moving primary", shard_name) - removal_info = self.client.admin.command("removeShard", shard_name) - self._log_removal_info(removal_info, shard_name) + # MongoDB docs say to re-run removeShard after running movePrimary + logger.info("removing shard: %s, after moving primary", shard_name) + removal_info = self.client.admin.command("removeShard", shard_name) + self._log_removal_info(removal_info, shard_name) + + if shard_name in self.get_shard_members(): + logger.info("Shard %s is still present in sharded cluster.", shard_name) + raise NotDrainedError() def _is_shard_draining(self, shard_name: str) -> bool: """Reports if a given shard is currently in the draining state. @@ -361,6 +365,12 @@ def is_shard_aware(self, shard_name: str) -> bool: def _retrieve_remaining_chunks(self, removal_info) -> int: """Parses the remaining chunks to remove from removeShard command.""" + # when chunks have finished draining, "remaining" is still in the removal info, but + # marked as 0.
If "remaining" is not present in removal_info, then the shard is not yet + # draining + if "remaining" not in removal_info: + raise NotDrainedError() + return removal_info["remaining"]["chunks"] if "remaining" in removal_info else 0 def _move_primary(self, databases_to_move: List[str], old_primary: str) -> None: diff --git a/lib/charms/mongodb/v1/shards_interface.py b/lib/charms/mongodb/v1/shards_interface.py index ff5526e68..fe19bfa2b 100644 --- a/lib/charms/mongodb/v1/shards_interface.py +++ b/lib/charms/mongodb/v1/shards_interface.py @@ -52,7 +52,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 2 +LIBPATCH = 3 KEYFILE_KEY = "key-file" HOSTS_KEY = "host" OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username()) @@ -241,6 +241,7 @@ def remove_shards(self, departed_shard_id): raises: PyMongoError, NotReadyError """ + retry_removal = False with MongosConnection(self.charm.mongos_config) as mongo: cluster_shards = mongo.get_shard_members() relation_shards = self._get_shards_from_relations(departed_shard_id) @@ -250,11 +251,19 @@ def remove_shards(self, departed_shard_id): self.charm.unit.status = MaintenanceStatus(f"Draining shard {shard}") logger.info("Attempting to removing shard: %s", shard) mongo.remove_shard(shard) + except NotReadyError: + logger.info("Unable to remove shard: %s, another shard is draining", shard) + # to guarantee that the currently draining shard gets re-processed, do not raise + # immediately; instead raise at the end of removal processing. + retry_removal = True except ShardNotInClusterError: logger.info( "Shard to remove is not in sharded cluster. It has been successfully removed." ) + if retry_removal: + raise ShardNotInClusterError + def update_credentials(self, key: str, value: str) -> None: """Sends new credentials, for a key value pair across all shards.""" for relation in self.charm.model.relations[self.relation_name]: @@ -432,6 +441,11 @@ def _on_relation_changed(self, event): # shards rely on the config server for secrets relation_data = event.relation.data[event.app] + if not relation_data.get(KEYFILE_KEY): + event.defer() + self.charm.unit.status = WaitingStatus("Waiting for secrets from config-server") + return + self.update_keyfile(key_file_contents=relation_data.get(KEYFILE_KEY)) # restart on high loaded databases can be very slow (e.g. up to 10-20 minutes). @@ -446,6 +460,10 @@ def _on_relation_changed(self, event): return # TODO Future work, see if needed to check for all units restarted / primary elected + if not relation_data.get(OPERATOR_PASSWORD_KEY): + event.defer() + self.charm.unit.status = WaitingStatus("Waiting for secrets from config-server") + return try: self.update_operator_password(new_password=relation_data.get(OPERATOR_PASSWORD_KEY)) @@ -516,12 +534,13 @@ def wait_for_draining(self, mongos_hosts: List[str]): while not drained: try: # no need to continuously check and abuse resources while shard is draining - time.sleep(10) + time.sleep(60) drained = self.drained(mongos_hosts, self.charm.app.name) self.charm.unit.status = MaintenanceStatus("Draining shard from cluster") draining_status = ( "Shard is still draining" if not drained else "Shard is fully drained."
) + self.charm.unit.status = MaintenanceStatus("Draining shard from cluster") logger.debug(draining_status) except PyMongoError as e: logger.error("Error occurred while draining shard: %s", e) diff --git a/tests/integration/sharding_tests/helpers.py b/tests/integration/sharding_tests/helpers.py index 868d45b1e..d8f9f8ae5 100644 --- a/tests/integration/sharding_tests/helpers.py +++ b/tests/integration/sharding_tests/helpers.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # Copyright 2023 Canonical Ltd. # See LICENSE file for licensing details. +from typing import List, Optional from urllib.parse import quote_plus from pymongo import MongoClient @@ -43,3 +44,38 @@ def verify_data_mongodb(client, db_name, coll_name, key, value) -> bool: test_collection = db[coll_name] query = test_collection.find({}, {key: 1}) return query[0][key] == value + + +def get_cluster_shards(mongos_client) -> set: + """Returns a set of the shard members.""" + shard_list = mongos_client.admin.command("listShards") + curr_members = [member["host"].split("/")[0] for member in shard_list["shards"]] + return set(curr_members) + + +def get_databases_for_shard(mongos_client, shard_name) -> Optional[List[str]]: + """Returns the databases hosted on the given shard.""" + config_db = mongos_client["config"] + if "databases" not in config_db.list_collection_names(): + return None + + databases_collection = config_db["databases"] + + if databases_collection is None: + return + + return databases_collection.distinct("_id", {"primary": shard_name}) + + +def has_correct_shards(mongos_client, expected_shards: List[str]) -> bool: + """Returns true if the cluster config has the expected shards.""" + shard_names = get_cluster_shards(mongos_client) + return shard_names == set(expected_shards) + + +def shard_has_databases( + mongos_client, shard_name: str, expected_databases_on_shard: List[str] +) -> bool: + """Returns true if the provided shard is a primary for the provided databases.""" + databases_on_shard = get_databases_for_shard(mongos_client, shard_name=shard_name) + return set(databases_on_shard) == set(expected_databases_on_shard) diff --git a/tests/integration/sharding_tests/test_sharding.py b/tests/integration/sharding_tests/test_sharding.py index 9b3c44069..562a3ec01 100644 --- a/tests/integration/sharding_tests/test_sharding.py +++ b/tests/integration/sharding_tests/test_sharding.py @@ -6,10 +6,17 @@ import pytest from pytest_operator.plugin import OpsTest -from .helpers import generate_mongodb_client, verify_data_mongodb, write_data_to_mongodb +from .helpers import ( + generate_mongodb_client, + has_correct_shards, + shard_has_databases, + verify_data_mongodb, + write_data_to_mongodb, +) SHARD_ONE_APP_NAME = "shard-one" SHARD_TWO_APP_NAME = "shard-two" +SHARD_THREE_APP_NAME = "shard-three" SHARD_APPS = [SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME] CONFIG_SERVER_APP_NAME = "config-server-one" SHARD_REL_NAME = "sharding" @@ -37,14 +44,17 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: await ops_test.model.deploy( my_charm, num_units=2, config={"role": "shard"}, application_name=SHARD_TWO_APP_NAME ) + await ops_test.model.deploy( + my_charm, num_units=2, config={"role": "shard"}, application_name=SHARD_THREE_APP_NAME + ) - async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle( - apps=[CONFIG_SERVER_APP_NAME, SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME], - idle_period=20, - raise_on_blocked=False, - timeout=TIMEOUT, - ) + await ops_test.model.wait_for_idle( + apps=[CONFIG_SERVER_APP_NAME, SHARD_ONE_APP_NAME, 
SHARD_THREE_APP_NAME], + idle_period=20, + raise_on_blocked=False, + timeout=TIMEOUT, + raise_on_error=False, + ) # verify that Charmed MongoDB is blocked and reports incorrect credentials await asyncio.gather( @@ -81,16 +91,36 @@ async def test_cluster_active(ops_test: OpsTest) -> None: f"{SHARD_TWO_APP_NAME}:{SHARD_REL_NAME}", f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}", ) + await ops_test.model.integrate( + f"{SHARD_THREE_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) - async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle( - apps=[CONFIG_SERVER_APP_NAME, SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME], - idle_period=20, - status="active", - timeout=TIMEOUT, - ) + await ops_test.model.wait_for_idle( + apps=[ + CONFIG_SERVER_APP_NAME, + SHARD_ONE_APP_NAME, + SHARD_TWO_APP_NAME, + SHARD_THREE_APP_NAME, + ], + idle_period=20, + status="active", + timeout=TIMEOUT, + raise_on_error=False, + ) + + mongos_client = await generate_mongodb_client( + ops_test, app_name=CONFIG_SERVER_APP_NAME, mongos=True + ) + + # verify sharded cluster config + assert has_correct_shards( + mongos_client, + expected_shards=[SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME, SHARD_THREE_APP_NAME], + ), "Config server did not process config properly" +@pytest.mark.abort_on_fail async def test_sharding(ops_test: OpsTest) -> None: """Tests writing data to mongos gets propagated to shards.""" # write data to mongos on both shards. @@ -98,46 +128,203 @@ async def test_sharding(ops_test: OpsTest) -> None: ops_test, app_name=CONFIG_SERVER_APP_NAME, mongos=True ) - # write data to shard one + # write data to shard two write_data_to_mongodb( mongos_client, db_name="animals_database_1", coll_name="horses", content={"horse-breed": "unicorn", "real": True}, ) - mongos_client.admin.command("movePrimary", "animals_database_1", to=SHARD_ONE_APP_NAME) + mongos_client.admin.command("movePrimary", "animals_database_1", to=SHARD_TWO_APP_NAME) - # write data to shard two + # write data to shard three write_data_to_mongodb( mongos_client, db_name="animals_database_2", coll_name="horses", content={"horse-breed": "pegasus", "real": True}, ) - mongos_client.admin.command("movePrimary", "animals_database_2", to=SHARD_TWO_APP_NAME) + mongos_client.admin.command("movePrimary", "animals_database_2", to=SHARD_THREE_APP_NAME) - # log into shard 1 verify data - shard_one_client = await generate_mongodb_client( - ops_test, app_name=SHARD_ONE_APP_NAME, mongos=False + # log into shard two verify data + shard_two_client = await generate_mongodb_client( + ops_test, app_name=SHARD_TWO_APP_NAME, mongos=False ) has_correct_data = verify_data_mongodb( - shard_one_client, + shard_two_client, db_name="animals_database_1", coll_name="horses", key="horse-breed", value="unicorn", ) - assert has_correct_data, "data not written to shard-one" + assert has_correct_data, "data not written to shard-two" # log into shard 2 verify data - shard_two_client = await generate_mongodb_client( - ops_test, app_name=SHARD_TWO_APP_NAME, mongos=False + shard_three_client = await generate_mongodb_client( + ops_test, app_name=SHARD_THREE_APP_NAME, mongos=False ) has_correct_data = verify_data_mongodb( - shard_two_client, + shard_three_client, db_name="animals_database_2", coll_name="horses", key="horse-breed", value="pegasus", ) - assert has_correct_data, "data not written to shard-two" + assert has_correct_data, "data not written to shard-three" + + +async def test_shard_removal(ops_test: OpsTest) -> None: + """Test shard 
removal. + + This test also verifies that: + - Databases that are using this shard as a primary are moved. + - The balancer is turned back on if turned off. + - Config server supports removing multiple shards. + """ + # turn off balancer. + mongos_client = await generate_mongodb_client( + ops_test, app_name=CONFIG_SERVER_APP_NAME, mongos=True + ) + mongos_client.admin.command("balancerStop") + balancer_state = mongos_client.admin.command("balancerStatus") + assert balancer_state["mode"] == "off", "balancer was not successfully turned off" + + # remove two shards at the same time + await ops_test.model.applications[CONFIG_SERVER_APP_NAME].remove_relation( + f"{SHARD_TWO_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) + await ops_test.model.applications[CONFIG_SERVER_APP_NAME].remove_relation( + f"{SHARD_THREE_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) + + await ops_test.model.wait_for_idle( + apps=[ + CONFIG_SERVER_APP_NAME, + SHARD_ONE_APP_NAME, + SHARD_TWO_APP_NAME, + SHARD_THREE_APP_NAME, + ], + idle_period=20, + status="active", + timeout=TIMEOUT, + raise_on_error=False, + ) + + # TODO future PR: assert statuses are correct + + # verify that config server turned back on the balancer + balancer_state = mongos_client.admin.command("balancerStatus") + assert balancer_state["mode"] != "off", "balancer not turned back on from config server" + + # verify sharded cluster config + assert has_correct_shards( + mongos_client, expected_shards=[SHARD_ONE_APP_NAME] + ), "Config server did not process config properly" + + # verify no data lost + assert shard_has_databases( + mongos_client, + shard_name=SHARD_ONE_APP_NAME, + expected_databases_on_shard=["animals_database_1", "animals_database_2"], + ), "Not all databases on final shard" + + +async def test_removal_of_non_primary_shard(ops_test: OpsTest): + """Tests safe removal of a shard that is not a primary.""" + # add back a shard so we can safely remove a shard. + await ops_test.model.integrate( + f"{SHARD_TWO_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}", ) + + await ops_test.model.wait_for_idle( + apps=[ + CONFIG_SERVER_APP_NAME, + SHARD_ONE_APP_NAME, + SHARD_TWO_APP_NAME, + SHARD_THREE_APP_NAME, + ], + idle_period=20, + status="active", + timeout=TIMEOUT, + raise_on_error=False, + ) + + await ops_test.model.applications[CONFIG_SERVER_APP_NAME].remove_relation( + f"{SHARD_TWO_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) + + await ops_test.model.wait_for_idle( + apps=[CONFIG_SERVER_APP_NAME, SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME], + idle_period=20, + status="active", + timeout=TIMEOUT, + raise_on_error=False, + ) + + mongos_client = await generate_mongodb_client( + ops_test, app_name=CONFIG_SERVER_APP_NAME, mongos=True ) + + # verify sharded cluster config + assert has_correct_shards( + mongos_client, expected_shards=[SHARD_ONE_APP_NAME] + ), "Config server did not process config properly" + + # verify no data lost + assert shard_has_databases( + mongos_client, + shard_name=SHARD_ONE_APP_NAME, + expected_databases_on_shard=["animals_database_1", "animals_database_2"], + ), "Not all databases on final shard" + + +async def test_unconventual_shard_removal(ops_test: OpsTest): + """Tests that removing a shard application safely drains data. + + It is preferred that users remove relations instead of removing shard applications.
But we do + support removing shard applications in a safe way. + """ + # add back a shard so we can safely remove a shard. + await ops_test.model.integrate( + f"{SHARD_TWO_APP_NAME}:{SHARD_REL_NAME}", + f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}", + ) + + await ops_test.model.applications[SHARD_TWO_APP_NAME].destroy_units(f"{SHARD_TWO_APP_NAME}/0") + await ops_test.model.wait_for_idle( + apps=[SHARD_TWO_APP_NAME], + idle_period=20, + status="active", + timeout=TIMEOUT, + raise_on_error=False, + ) + + await ops_test.model.remove_application(SHARD_TWO_APP_NAME, block_until_done=True) + + await ops_test.model.wait_for_idle( + apps=[CONFIG_SERVER_APP_NAME, SHARD_ONE_APP_NAME], + idle_period=20, + status="active", + timeout=TIMEOUT, + raise_on_error=False, + ) + + mongos_client = await generate_mongodb_client( + ops_test, app_name=CONFIG_SERVER_APP_NAME, mongos=True + ) + + # verify sharded cluster config + assert has_correct_shards( + mongos_client, expected_shards=[SHARD_ONE_APP_NAME] + ), "Config server did not process config properly" + + # verify no data lost + assert shard_has_databases( + mongos_client, + shard_name=SHARD_ONE_APP_NAME, + expected_databases_on_shard=["animals_database_1", "animals_database_2"], + ), "Not all databases on final shard"
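For reference, here is a minimal sketch of the relation-changed guard described in Extras item 2, assuming a simplified stand-in charm. The real handler lives in `lib/charms/mongodb/v1/shards_interface.py`, and the real `OPERATOR_PASSWORD_KEY` is derived via `MongoDBUser`, so the literal value and class name below are assumptions for illustration only.

```python
# Illustrative sketch (not part of this patch): defer relation-changed handling
# until the config server has shared both secrets.
from ops.charm import CharmBase, RelationChangedEvent
from ops.model import WaitingStatus

KEYFILE_KEY = "key-file"
OPERATOR_PASSWORD_KEY = "operator-password"  # assumed literal; the charm derives this via MongoDBUser


class ShardSketchCharm(CharmBase):
    """Minimal stand-in for the shard side of the sharding relation."""

    def __init__(self, *args):
        super().__init__(*args)
        self.framework.observe(
            self.on["sharding"].relation_changed, self._on_relation_changed
        )

    def _on_relation_changed(self, event: RelationChangedEvent) -> None:
        relation_data = event.relation.data[event.app]

        # The keyfile and operator password may arrive after relation-joined,
        # so wait (defer) until both are present instead of failing part-way.
        if not relation_data.get(KEYFILE_KEY) or not relation_data.get(OPERATOR_PASSWORD_KEY):
            self.unit.status = WaitingStatus("Waiting for secrets from config-server")
            event.defer()
            return

        # Both secrets are available; update the keyfile and operator password here.
```

Deferring keeps the handler idempotent: the deferred event is re-run on a later hook dispatch, by which point the config server has normally published both values.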