From eb9c9e046050be2cbd3b8746913405a7463ef0d5 Mon Sep 17 00:00:00 2001 From: Mia Altieri <32723809+MiaAltieri@users.noreply.github.com> Date: Tue, 11 Jun 2024 20:00:59 +0200 Subject: [PATCH] New upgrade implementation (#418) ## Issue Current upgrade implementation should be replaced with v2 ## Solution Replace current upgrade implementation with v2 --------- Co-authored-by: Carl Csaposs Co-authored-by: Mehdi Bendriss --- actions.yaml | 8 +- charm_version | 1 + charmcraft.yaml | 3 + lib/charms/data_platform_libs/v0/upgrade.py | 1091 ----------------- .../mongodb/v0/config_server_interface.py | 17 +- lib/charms/mongodb/v0/mongodb.py | 58 + lib/charms/mongodb/v0/mongodb_tls.py | 36 +- lib/charms/mongodb/v1/mongodb_backups.py | 32 +- lib/charms/mongodb/v1/mongodb_provider.py | 12 +- lib/charms/mongodb/v1/shards_interface.py | 88 +- metadata.yaml | 5 +- pyproject.toml | 3 +- requirements.txt | 2 +- src/charm.py | 137 ++- src/config.py | 9 - src/events/upgrade.py | 282 ----- src/status_exception.py | 14 + src/upgrades/machine_upgrade.py | 198 +++ src/upgrades/mongodb_upgrade.py | 432 +++++++ src/upgrades/upgrade.py | 302 +++++ .../integration/backup_tests/test_backups.py | 53 +- tests/integration/helpers.py | 138 ++- .../sharding_tests/test_sharding.py | 57 +- .../sharding_tests/test_sharding_relations.py | 131 +- .../sharding_tests/test_sharding_tls.py | 43 +- tests/integration/test_charm.py | 3 + tests/integration/upgrade/test_upgrade.py | 7 +- tests/unit/test_charm.py | 9 +- tests/unit/test_upgrade.py | 94 +- tox.ini | 28 +- workload_version | 1 + 31 files changed, 1506 insertions(+), 1788 deletions(-) create mode 100644 charm_version delete mode 100644 lib/charms/data_platform_libs/v0/upgrade.py delete mode 100644 src/events/upgrade.py create mode 100644 src/status_exception.py create mode 100644 src/upgrades/machine_upgrade.py create mode 100644 src/upgrades/mongodb_upgrade.py create mode 100644 src/upgrades/upgrade.py create mode 100644 workload_version diff --git a/actions.yaml b/actions.yaml index 2294d6124..d6cfed794 100644 --- a/actions.yaml +++ b/actions.yaml @@ -57,4 +57,10 @@ set-tls-private-key: description: The content of private key for internal communications with clients. Content will be auto-generated if this option is not specified. pre-upgrade-check: - description: Run necessary pre-upgrade checks before executing a charm upgrade. + description: Check if charm is ready to upgrade + +resume-upgrade: + description: Upgrade remaining units (after you manually verified that upgraded units are healthy). + +force-upgrade: + description: Force upgrade of this unit. diff --git a/charm_version b/charm_version new file mode 100644 index 000000000..d00491fd7 --- /dev/null +++ b/charm_version @@ -0,0 +1 @@ +1 diff --git a/charmcraft.yaml b/charmcraft.yaml index 498a4690b..86437bb19 100644 --- a/charmcraft.yaml +++ b/charmcraft.yaml @@ -20,3 +20,6 @@ parts: - libssl-dev - rustc - cargo + prime: + - charm_version + - workload_version diff --git a/lib/charms/data_platform_libs/v0/upgrade.py b/lib/charms/data_platform_libs/v0/upgrade.py deleted file mode 100644 index ef74644de..000000000 --- a/lib/charms/data_platform_libs/v0/upgrade.py +++ /dev/null @@ -1,1091 +0,0 @@ -# Copyright 2023 Canonical Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -r"""Library to manage in-place upgrades for charms running on VMs and K8s. - -This library contains handlers for `upgrade` relation events used to coordinate -between units in an application during a `juju refresh`, as well as `Pydantic` models -for instantiating, validating and comparing dependencies. - -An upgrade on VMs is initiated with the command `juju refresh`. Once executed, the following -events are emitted to each unit at random: - - `upgrade-charm` - - `config-changed` - - `leader-settings-changed` - Non-leader only - -Charm authors can implement the classes defined in this library to streamline the process of -coordinating which unit updates when, achieved through updating of unit-data `state` throughout. - -At a high-level, the upgrade steps are as follows: - - Run pre-checks on the cluster to confirm it is safe to upgrade - - Create stack of unit.ids, to serve as the upgrade order (generally workload leader is last) - - Start the upgrade by issuing a Juju CLI command - - The unit at the top of the stack gets permission to upgrade - - The unit handles the upgrade and restarts their service - - Repeat, until all units have restarted - -### Usage by charm authors - -#### `upgrade` relation - -Charm authors must implement an additional peer-relation. - -As this library uses relation data exchanged between units to coordinate, charm authors -need to add a new relation interface. The relation name does not matter. - -`metadata.yaml` -```yaml -peers: - upgrade: - interface: upgrade -``` - -#### Dependencies JSON/Dict - -Charm authors must implement a dict object tracking current charm versions, requirements + upgradability. - -Many workload versions may be incompatible with older/newer versions. This same idea also can apply to -charm or snap versions. Workloads with required related applications (e.g Kafka + ZooKeeper) also need to -ensure their versions are compatible during an upgrade, to avoid cluster failure. - -As such, it is necessasry to freeze any dependencies within each published charm. An example of this could -be creating a `DEPENDENCIES` dict within the charm code, with the following structure: - -`src/literals.py` -```python -DEPENDENCIES = { - "kafka_charm": { - "dependencies": {"zookeeper": ">50"}, - "name": "kafka", - "upgrade_supported": ">90", - "version": "100", - }, - "kafka_service": { - "dependencies": {"zookeeper": "^3"}, - "name": "kafka", - "upgrade_supported": ">=0.8", - "version": "3.3.2", - }, -} -``` - -The first-level key names are arbitrary labels for tracking what those versions+dependencies are for. -The `dependencies` second-level values are a key-value map of any required external applications, - and the versions this packaged charm can support. -The `upgrade_suppported` second-level values are requirements from which an in-place upgrade can be - supported by the charm. -The `version` second-level values correspond to the current version of this packaged charm. - -Any requirements comply with [`poetry`'s dependency specifications](https://python-poetry.org/docs/dependency-specification/#caret-requirements). 
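For illustration, the constraint strings in `DEPENDENCIES` are evaluated with `poetry-core`, the same way this (now removed) library's `verify_requirements` helper further below does. A minimal sketch of that check, assuming only `poetry-core` (listed in the library's `PYDEPS`):

```python
# Minimal sketch: how a Poetry-style constraint such as ">90" or "^3" is
# checked against a concrete version string. Mirrors verify_requirements()
# defined later in this file.
import poetry.core.constraints.version as poetry_version


def satisfies(version: str, requirement: str) -> bool:
    """Return True if `version` meets the Poetry-style `requirement`."""
    return poetry_version.parse_constraint(requirement).allows(
        poetry_version.Version.parse(version)
    )


assert satisfies("100", ">90")      # kafka_charm: version vs upgrade_supported
assert satisfies("3.3.2", ">=0.8")  # kafka_service: version vs upgrade_supported
```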
- -### Dependency Model - -Charm authors must implement their own class inheriting from `DependencyModel`. - -Using a `Pydantic` model to instantiate the aforementioned `DEPENDENCIES` dict gives stronger type safety and additional -layers of validation. - -Implementation just needs to ensure that the top-level key names from `DEPENDENCIES` are defined as attributed in the model. - -`src/upgrade.py` -```python -from pydantic import BaseModel - -class KafkaDependenciesModel(BaseModel): - kafka_charm: DependencyModel - kafka_service: DependencyModel -``` - -### Overrides for `DataUpgrade` - -Charm authors must define their own class, inheriting from `DataUpgrade`, overriding all required `abstractmethod`s. - -```python -class ZooKeeperUpgrade(DataUpgrade): - def __init__(self, charm: "ZooKeeperUpgrade", **kwargs): - super().__init__(charm, **kwargs) - self.charm = charm -``` - -#### Implementation of `pre_upgrade_check()` - -Before upgrading a cluster, it's a good idea to check that it is stable and healthy before permitting it. -Here, charm authors can validate upgrade safety through API calls, relation-data checks, etc. -If any of these checks fail, raise `ClusterNotReadyError`. - -```python - @override - def pre_upgrade_check(self) -> None: - default_message = "Pre-upgrade check failed and cannot safely upgrade" - try: - if not self.client.members_broadcasting or not len(self.client.server_members) == len( - self.charm.cluster.peer_units - ): - raise ClusterNotReadyError( - message=default_message, - cause="Not all application units are connected and broadcasting in the quorum", - ) - - if self.client.members_syncing: - raise ClusterNotReadyError( - message=default_message, cause="Some quorum members are syncing data" - ) - - if not self.charm.cluster.stable: - raise ClusterNotReadyError( - message=default_message, cause="Charm has not finished initialising" - ) - - except QuorumLeaderNotFoundError: - raise ClusterNotReadyError(message=default_message, cause="Quorum leader not found") - except ConnectionClosedError: - raise ClusterNotReadyError( - message=default_message, cause="Unable to connect to the cluster" - ) -``` - -#### Implementation of `build_upgrade_stack()` - VM ONLY - -Oftentimes, it is necessary to ensure that the workload leader is the last unit to upgrade, -to ensure high-availability during the upgrade process. -Here, charm authors can create a LIFO stack of unit.ids, represented as a list of unit.id strings, -with the leader unit being at i[0]. - -```python -@override -def build_upgrade_stack(self) -> list[int]: - upgrade_stack = [] - for unit in self.charm.cluster.peer_units: - config = self.charm.cluster.unit_config(unit=unit) - - # upgrade quorum leader last - if config["host"] == self.client.leader: - upgrade_stack.insert(0, int(config["unit_id"])) - else: - upgrade_stack.append(int(config["unit_id"])) - - return upgrade_stack -``` - -#### Implementation of `_on_upgrade_granted()` - -On relation-changed events, each unit will check the current upgrade-stack persisted to relation data. -If that unit is at the top of the stack, it will emit an `upgrade-granted` event, which must be handled. -Here, workloads can be re-installed with new versions, checks can be made, data synced etc. -If the new unit successfully rejoined the cluster, call `set_unit_completed()`. -If the new unit failed to rejoin the cluster, call `set_unit_failed()`. - -NOTE - It is essential here to manually call `on_upgrade_changed` if the unit is the current leader. 
-This ensures that the leader gets it's own relation-changed event, and updates the upgrade-stack for -other units to follow suit. - -```python -@override -def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None: - self.charm.snap.stop_snap_service() - - if not self.charm.snap.install(): - logger.error("Unable to install ZooKeeper Snap") - self.set_unit_failed() - return None - - logger.info(f"{self.charm.unit.name} upgrading service...") - self.charm.snap.restart_snap_service() - - try: - logger.debug("Running post-upgrade check...") - self.pre_upgrade_check() - - logger.debug("Marking unit completed...") - self.set_unit_completed() - - # ensures leader gets it's own relation-changed when it upgrades - if self.charm.unit.is_leader(): - logger.debug("Re-emitting upgrade-changed on leader...") - self.on_upgrade_changed(event) - - except ClusterNotReadyError as e: - logger.error(e.cause) - self.set_unit_failed() -``` - -#### Implementation of `log_rollback_instructions()` - -If the upgrade fails, manual intervention may be required for cluster recovery. -Here, charm authors can log out any necessary steps to take to recover from a failed upgrade. -When a unit fails, this library will automatically log out this message. - -```python -@override -def log_rollback_instructions(self) -> None: - logger.error("Upgrade failed. Please run `juju refresh` to previous version.") -``` - -### Instantiating in the charm and deferring events - -Charm authors must add a class attribute for the child class of `DataUpgrade` in the main charm. -They must also ensure that any non-upgrade related events that may be unsafe to handle during -an upgrade, are deferred if the unit is not in the `idle` state - i.e not currently upgrading. - -```python -class ZooKeeperCharm(CharmBase): - def __init__(self, *args): - super().__init__(*args) - self.upgrade = ZooKeeperUpgrade( - self, - relation_name = "upgrade", - substrate = "vm", - dependency_model=ZooKeeperDependencyModel( - **DEPENDENCIES - ), - ) - - def restart(self, event) -> None: - if not self.upgrade.state == "idle": - event.defer() - return None - - self.restart_snap_service() -``` -""" - -import json -import logging -from abc import ABC, abstractmethod -from typing import Dict, List, Literal, Optional, Set, Tuple - -import poetry.core.constraints.version as poetry_version -from ops.charm import ( - ActionEvent, - CharmBase, - CharmEvents, - RelationCreatedEvent, - UpgradeCharmEvent, -) -from ops.framework import EventBase, EventSource, Object -from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus, Relation, Unit, WaitingStatus -from pydantic import BaseModel, root_validator, validator - -# The unique Charmhub library identifier, never change it -LIBID = "156258aefb79435a93d933409a8c8684" - -# Increment this major API version when introducing breaking changes -LIBAPI = 0 - -# Increment this PATCH version before using `charmcraft publish-lib` or reset -# to 0 if you are raising the major API version -LIBPATCH = 16 - -PYDEPS = ["pydantic>=1.10,<2", "poetry-core"] - -logger = logging.getLogger(__name__) - -# --- DEPENDENCY RESOLUTION FUNCTIONS --- - - -def verify_requirements(version: str, requirement: str) -> bool: - """Verifies a specified version against defined constraint. 
- - Supports Poetry version constraints - https://python-poetry.org/docs/dependency-specification/#version-constraints - - Args: - version: the version currently in use - requirement: Poetry version constraint - - Returns: - True if `version` meets defined `requirement`. Otherwise False - """ - return poetry_version.parse_constraint(requirement).allows( - poetry_version.Version.parse(version) - ) - - -# --- DEPENDENCY MODEL TYPES --- - - -class DependencyModel(BaseModel): - """Manager for a single dependency. - - To be used as part of another model representing a collection of arbitrary dependencies. - - Example:: - - class KafkaDependenciesModel(BaseModel): - kafka_charm: DependencyModel - kafka_service: DependencyModel - - deps = { - "kafka_charm": { - "dependencies": {"zookeeper": ">5"}, - "name": "kafka", - "upgrade_supported": ">5", - "version": "10", - }, - "kafka_service": { - "dependencies": {"zookeeper": "^3.6"}, - "name": "kafka", - "upgrade_supported": "~3.3", - "version": "3.3.2", - }, - } - - model = KafkaDependenciesModel(**deps) # loading dict in to model - - print(model.dict()) # exporting back validated deps - """ - - dependencies: Dict[str, str] - name: str - upgrade_supported: str - version: str - - @validator("dependencies", "upgrade_supported", each_item=True) - @classmethod - def dependencies_validator(cls, value): - """Validates version constraint.""" - if isinstance(value, dict): - deps = value.values() - else: - deps = [value] - - for dep in deps: - poetry_version.parse_constraint(dep) - - return value - - @root_validator(skip_on_failure=True) - @classmethod - def version_upgrade_supported_validator(cls, values): - """Validates specified `version` meets `upgrade_supported` requirement.""" - if not verify_requirements( - version=values.get("version"), requirement=values.get("upgrade_supported") - ): - raise ValueError( - f"upgrade_supported value {values.get('upgrade_supported')} greater than version value {values.get('version')} for {values.get('name')}." - ) - - return values - - def can_upgrade(self, dependency: "DependencyModel") -> bool: - """Compares two instances of :class:`DependencyModel` for upgradability. - - Args: - dependency: a dependency model to compare this model against - - Returns: - True if current model can upgrade from dependent model. Otherwise False - """ - return verify_requirements(version=self.version, requirement=dependency.upgrade_supported) - - -# --- CUSTOM EXCEPTIONS --- - - -class UpgradeError(Exception): - """Base class for upgrade related exceptions in the module.""" - - def __init__(self, message: str, cause: Optional[str], resolution: Optional[str]): - super().__init__(message) - self.message = message - self.cause = cause or "" - self.resolution = resolution or "" - - def __repr__(self): - """Representation of the UpgradeError class.""" - return f"{type(self).__module__}.{type(self).__name__} - {str(vars(self))}" - - def __str__(self): - """String representation of the UpgradeError class.""" - return repr(self) - - -class ClusterNotReadyError(UpgradeError): - """Exception flagging that the cluster is not ready to start upgrading. 
- - For example, if the cluster fails :class:`DataUpgrade._on_pre_upgrade_check_action` - - Args: - message: string message to be logged out - cause: short human-readable description of the cause of the error - resolution: short human-readable instructions for manual error resolution (optional) - """ - - def __init__(self, message: str, cause: str, resolution: Optional[str] = None): - super().__init__(message, cause=cause, resolution=resolution) - - -class KubernetesClientError(UpgradeError): - """Exception flagging that a call to Kubernetes API failed. - - For example, if the cluster fails :class:`DataUpgrade._set_rolling_update_partition` - - Args: - message: string message to be logged out - cause: short human-readable description of the cause of the error - resolution: short human-readable instructions for manual error resolution (optional) - """ - - def __init__(self, message: str, cause: str, resolution: Optional[str] = None): - super().__init__(message, cause=cause, resolution=resolution) - - -class VersionError(UpgradeError): - """Exception flagging that the old `version` fails to meet the new `upgrade_supported`s. - - For example, upgrades from version `2.x` --> `4.x`, - but `4.x` only supports upgrading from `3.x` onwards - - Args: - message: string message to be logged out - cause: short human-readable description of the cause of the error - resolution: short human-readable instructions for manual solutions to the error (optional) - """ - - def __init__(self, message: str, cause: str, resolution: Optional[str] = None): - super().__init__(message, cause=cause, resolution=resolution) - - -class DependencyError(UpgradeError): - """Exception flagging that some new `dependency` is not being met. - - For example, new version requires related App version `2.x`, but currently is `1.x` - - Args: - message: string message to be logged out - cause: short human-readable description of the cause of the error - resolution: short human-readable instructions for manual solutions to the error (optional) - """ - - def __init__(self, message: str, cause: str, resolution: Optional[str] = None): - super().__init__(message, cause=cause, resolution=resolution) - - -# --- CUSTOM EVENTS --- - - -class UpgradeGrantedEvent(EventBase): - """Used to tell units that they can process an upgrade.""" - - -class UpgradeFinishedEvent(EventBase): - """Used to tell units that they finished the upgrade.""" - - -class UpgradeEvents(CharmEvents): - """Upgrade events. - - This class defines the events that the lib can emit. 
- """ - - upgrade_granted = EventSource(UpgradeGrantedEvent) - upgrade_finished = EventSource(UpgradeFinishedEvent) - - -# --- EVENT HANDLER --- - - -class DataUpgrade(Object, ABC): - """Manages `upgrade` relation operations for in-place upgrades.""" - - STATES = ["recovery", "failed", "idle", "ready", "upgrading", "completed"] - - on = UpgradeEvents() # pyright: ignore [reportAssignmentType] - - def __init__( - self, - charm: CharmBase, - dependency_model: BaseModel, - relation_name: str = "upgrade", - substrate: Literal["vm", "k8s"] = "vm", - ): - super().__init__(charm, relation_name) - self.charm = charm - self.dependency_model = dependency_model - self.relation_name = relation_name - self.substrate = substrate - self._upgrade_stack = None - - # events - self.framework.observe( - self.charm.on[relation_name].relation_created, self._on_upgrade_created - ) - self.framework.observe( - self.charm.on[relation_name].relation_changed, self.on_upgrade_changed - ) - self.framework.observe(self.charm.on.upgrade_charm, self._on_upgrade_charm) - self.framework.observe(getattr(self.on, "upgrade_granted"), self._on_upgrade_granted) - self.framework.observe(getattr(self.on, "upgrade_finished"), self._on_upgrade_finished) - - # actions - self.framework.observe( - getattr(self.charm.on, "pre_upgrade_check_action"), self._on_pre_upgrade_check_action - ) - if self.substrate == "k8s": - self.framework.observe( - getattr(self.charm.on, "resume_upgrade_action"), self._on_resume_upgrade_action - ) - - @property - def peer_relation(self) -> Optional[Relation]: - """The upgrade peer relation.""" - return self.charm.model.get_relation(self.relation_name) - - @property - def app_units(self) -> Set[Unit]: - """The peer-related units in the application.""" - if not self.peer_relation: - return set() - - return set([self.charm.unit] + list(self.peer_relation.units)) - - @property - def state(self) -> Optional[str]: - """The unit state from the upgrade peer relation.""" - if not self.peer_relation: - return None - - return self.peer_relation.data[self.charm.unit].get("state", None) - - @property - def stored_dependencies(self) -> Optional[BaseModel]: - """The application dependencies from the upgrade peer relation.""" - if not self.peer_relation: - return None - - if not (deps := self.peer_relation.data[self.charm.app].get("dependencies", "")): - return None - - return type(self.dependency_model)(**json.loads(deps)) - - @property - def upgrade_stack(self) -> Optional[List[int]]: - """Gets the upgrade stack from the upgrade peer relation. - - Unit.ids are ordered Last-In-First-Out (LIFO). - i.e unit.id at index `-1` is the first unit to upgrade. - unit.id at index `0` is the last unit to upgrade. - - Returns: - List of integer unit.ids, ordered in upgrade order in a stack - """ - if not self.peer_relation: - return None - - # lazy-load - if self._upgrade_stack is None: - self._upgrade_stack = ( - json.loads(self.peer_relation.data[self.charm.app].get("upgrade-stack", "[]")) - or None - ) - - return self._upgrade_stack - - @upgrade_stack.setter - def upgrade_stack(self, stack: List[int]) -> None: - """Sets the upgrade stack to the upgrade peer relation. - - Unit.ids are ordered Last-In-First-Out (LIFO). - i.e unit.id at index `-1` is the first unit to upgrade. - unit.id at index `0` is the last unit to upgrade. 
- """ - if not self.peer_relation: - return - - self.peer_relation.data[self.charm.app].update({"upgrade-stack": json.dumps(stack)}) - self._upgrade_stack = stack - - @property - def other_unit_states(self) -> list: - """Current upgrade state for other units. - - Returns: - Unsorted list of upgrade states for other units. - """ - if not self.peer_relation: - return [] - - return [ - self.peer_relation.data[unit].get("state", "") - for unit in list(self.peer_relation.units) - ] - - @property - def unit_states(self) -> list: - """Current upgrade state for all units. - - Returns: - Unsorted list of upgrade states for all units. - """ - if not self.peer_relation: - return [] - - return [self.peer_relation.data[unit].get("state", "") for unit in self.app_units] - - @property - def cluster_state(self) -> Optional[str]: - """Current upgrade state for cluster units. - - Determined from :class:`DataUpgrade.STATE`, taking the lowest ordinal unit state. - - For example, if units in have states: `["ready", "upgrading", "completed"]`, - the overall state for the cluster is `ready`. - - Returns: - String of upgrade state from the furthest behind unit. - """ - if not self.unit_states: - return None - - try: - return sorted(self.unit_states, key=self.STATES.index)[0] - except (ValueError, KeyError): - return None - - @property - def idle(self) -> Optional[bool]: - """Flag for whether the cluster is in an idle upgrade state. - - Returns: - True if all application units in idle state. Otherwise False - """ - return set(self.unit_states) == {"idle"} - - @abstractmethod - def pre_upgrade_check(self) -> None: - """Runs necessary checks validating the cluster is in a healthy state to upgrade. - - Called by all units during :meth:`_on_pre_upgrade_check_action`. - - Raises: - :class:`ClusterNotReadyError`: if cluster is not ready to upgrade - """ - pass - - def build_upgrade_stack(self) -> List[int]: - """Builds ordered iterable of all application unit.ids to upgrade in. - - Called by leader unit during :meth:`_on_pre_upgrade_check_action`. - - Returns: - Iterable of integer unit.ids, LIFO ordered in upgrade order - i.e `[5, 2, 4, 1, 3]`, unit `3` upgrades first, `5` upgrades last - """ - # don't raise if k8s substrate, uses default statefulset order - if self.substrate == "k8s": - return [] - - raise NotImplementedError - - @abstractmethod - def log_rollback_instructions(self) -> None: - """Sets charm state and logs out rollback instructions. - - Called by all units when `state=failed` found during :meth:`_on_upgrade_changed`. 
- """ - pass - - def _repair_upgrade_stack(self) -> None: - """Ensures completed units are re-added to the upgrade-stack after failure.""" - # need to update the stack as it was not refreshed by rollback run of pre-upgrade-check - # avoids difficult health check implementation by charm-authors needing to exclude dead units - - # if the first unit in the stack fails, the stack will be the same length as units - # i.e this block not ran - if ( - self.cluster_state in ["failed", "recovery"] - and self.upgrade_stack - and len(self.upgrade_stack) != len(self.app_units) - and self.charm.unit.is_leader() - ): - new_stack = self.upgrade_stack - for unit in self.app_units: - unit_id = int(unit.name.split("/")[1]) - - # if a unit fails, it rolls back first - if unit_id not in new_stack: - new_stack.insert(-1, unit_id) - logger.debug(f"Inserted {unit_id} in to upgrade-stack - {new_stack}") - - self.upgrade_stack = new_stack - - def set_unit_failed(self, cause: Optional[str] = None) -> None: - """Sets unit `state=failed` to the upgrade peer data. - - Args: - cause: short description of cause of failure - """ - if not self.peer_relation: - return None - - # needed to refresh the stack - # now leader pulls a fresh stack from newly updated relation data - if self.charm.unit.is_leader(): - self._upgrade_stack = None - - self.charm.unit.status = BlockedStatus(cause if cause else "") - self.peer_relation.data[self.charm.unit].update({"state": "failed"}) - self.log_rollback_instructions() - - def set_unit_completed(self) -> None: - """Sets unit `state=completed` to the upgrade peer data.""" - if not self.peer_relation: - return None - - # needed to refresh the stack - # now leader pulls a fresh stack from newly updated relation data - if self.charm.unit.is_leader(): - self._upgrade_stack = None - - self.charm.unit.status = MaintenanceStatus("upgrade completed") - self.peer_relation.data[self.charm.unit].update({"state": "completed"}) - - # Emit upgrade_finished event to run unit's post upgrade operations. - if self.substrate == "k8s": - logger.debug( - f"{self.charm.unit.name} has completed the upgrade, emitting `upgrade_finished` event..." 
- ) - getattr(self.on, "upgrade_finished").emit() - - def _on_upgrade_created(self, event: RelationCreatedEvent) -> None: - """Handler for `upgrade-relation-created` events.""" - if not self.peer_relation: - event.defer() - return - - # setting initial idle state needed to avoid execution on upgrade-changed events - self.peer_relation.data[self.charm.unit].update({"state": "idle"}) - - if self.charm.unit.is_leader(): - logger.debug("Persisting dependencies to upgrade relation data...") - self.peer_relation.data[self.charm.app].update( - {"dependencies": json.dumps(self.dependency_model.dict())} - ) - - def _on_pre_upgrade_check_action(self, event: ActionEvent) -> None: - """Handler for `pre-upgrade-check-action` events.""" - if not self.peer_relation: - event.fail(message="Could not find upgrade relation.") - return - - if not self.charm.unit.is_leader(): - event.fail(message="Action must be ran on the Juju leader.") - return - - if self.cluster_state == "failed": - logger.info("Entering recovery state for rolling-back to previous version...") - self._repair_upgrade_stack() - self.charm.unit.status = BlockedStatus("ready to rollback application") - self.peer_relation.data[self.charm.unit].update({"state": "recovery"}) - return - - # checking if upgrade in progress - if self.cluster_state != "idle": - event.fail("Cannot run pre-upgrade checks, cluster already upgrading.") - return - - try: - logger.info("Running pre-upgrade-check...") - self.pre_upgrade_check() - - if self.substrate == "k8s": - logger.info("Building upgrade-stack for K8s...") - built_upgrade_stack = sorted( - [int(unit.name.split("/")[1]) for unit in self.app_units] - ) - else: - logger.info("Building upgrade-stack for VMs...") - built_upgrade_stack = self.build_upgrade_stack() - - logger.debug(f"Built upgrade stack of {built_upgrade_stack}") - - except ClusterNotReadyError as e: - logger.error(e) - event.fail(message=e.message) - return - except Exception as e: - logger.error(e) - event.fail(message="Unknown error found.") - return - - logger.info("Setting upgrade-stack to relation data...") - self.upgrade_stack = built_upgrade_stack - - def _on_resume_upgrade_action(self, event: ActionEvent) -> None: - """Handle resume upgrade action. - - Continue the upgrade by setting the partition to the next unit. - """ - if not self.peer_relation: - event.fail(message="Could not find upgrade relation.") - return - - if not self.charm.unit.is_leader(): - event.fail(message="Action must be ran on the Juju leader.") - return - - if not self.upgrade_stack: - event.fail(message="Nothing to resume, upgrade stack unset.") - return - - # Check whether this is being run after juju refresh was called - # (the size of the upgrade stack should match the number of total - # unit minus one). - if len(self.upgrade_stack) != len(self.peer_relation.units): - event.fail(message="Upgrade can be resumed only once after juju refresh is called.") - return - - try: - next_partition = self.upgrade_stack[-1] - self._set_rolling_update_partition(partition=next_partition) - event.set_results({"message": f"Upgrade will resume on unit {next_partition}"}) - except KubernetesClientError: - event.fail(message="Cannot set rolling update partition.") - - def _upgrade_supported_check(self) -> None: - """Checks if previous versions can be upgraded to new versions. 
- - Raises: - :class:`VersionError` if upgrading to existing `version` is not supported - """ - keys = self.dependency_model.__fields__.keys() - - compatible = True - incompatibilities: List[Tuple[str, str, str, str]] = [] - for key in keys: - old_dep: DependencyModel = getattr(self.stored_dependencies, key) - new_dep: DependencyModel = getattr(self.dependency_model, key) - - if not old_dep.can_upgrade(dependency=new_dep): - compatible = False - incompatibilities.append( - (key, old_dep.version, new_dep.version, new_dep.upgrade_supported) - ) - - base_message = "Versions incompatible" - base_cause = "Upgrades only supported for specific versions" - if not compatible: - for incompat in incompatibilities: - base_message += ( - f", {incompat[0]} {incompat[1]} can not be upgraded to {incompat[2]}" - ) - base_cause += f", {incompat[0]} versions satisfying requirement {incompat[3]}" - - raise VersionError( - message=base_message, - cause=base_cause, - ) - - def _on_upgrade_charm(self, event: UpgradeCharmEvent) -> None: - """Handler for `upgrade-charm` events.""" - # defer if not all units have pre-upgraded - if not self.peer_relation: - event.defer() - return - - if not self.upgrade_stack: - logger.error("Cluster upgrade failed, ensure pre-upgrade checks are ran first.") - return - - if self.substrate == "vm": - # for VM run version checks on leader only - if self.charm.unit.is_leader(): - try: - self._upgrade_supported_check() - except VersionError as e: # not ready if not passed check - logger.error(e) - self.set_unit_failed() - return - self.charm.unit.status = WaitingStatus("other units upgrading first...") - self.peer_relation.data[self.charm.unit].update({"state": "ready"}) - - if self.charm.app.planned_units() == 1: - # single unit upgrade, emit upgrade_granted event right away - getattr(self.on, "upgrade_granted").emit() - - else: - # for k8s run version checks only on highest ordinal unit - if ( - self.charm.unit.name - == f"{self.charm.app.name}/{self.charm.app.planned_units() -1}" - ): - try: - self._upgrade_supported_check() - except VersionError as e: # not ready if not passed check - logger.error(e) - self.set_unit_failed() - return - # On K8s an unit that receives the upgrade-charm event is upgrading - self.charm.unit.status = MaintenanceStatus("upgrading unit") - self.peer_relation.data[self.charm.unit].update({"state": "upgrading"}) - - def on_upgrade_changed(self, event: EventBase) -> None: - """Handler for `upgrade-relation-changed` events.""" - if not self.peer_relation: - return - - # if any other unit failed, don't continue with upgrade - if self.cluster_state == "failed": - logger.debug("Cluster failed to upgrade, exiting...") - return - - if self.substrate == "vm" and self.cluster_state == "recovery": - # skip run while in recovery. 
The event will be retrigged when the cluster is ready - logger.debug("Cluster in recovery, skip...") - return - - # if all units completed, mark as complete - if not self.upgrade_stack: - if self.state == "completed" and self.cluster_state in ["idle", "completed"]: - logger.info("All units completed upgrade, setting idle upgrade state...") - self.charm.unit.status = ActiveStatus() - self.peer_relation.data[self.charm.unit].update({"state": "idle"}) - - if self.charm.unit.is_leader(): - logger.debug("Persisting new dependencies to upgrade relation data...") - self.peer_relation.data[self.charm.app].update( - {"dependencies": json.dumps(self.dependency_model.dict())} - ) - return - - if self.cluster_state == "idle": - logger.debug("upgrade-changed event handled before pre-checks, exiting...") - return - - logger.debug("Did not find upgrade-stack or completed cluster state, skipping...") - return - - # upgrade ongoing, set status for waiting units - if "upgrading" in self.unit_states and self.state in ["idle", "ready"]: - self.charm.unit.status = WaitingStatus("other units upgrading first...") - - # pop mutates the `upgrade_stack` attr - top_unit_id = self.upgrade_stack.pop() - top_unit = self.charm.model.get_unit(f"{self.charm.app.name}/{top_unit_id}") - top_state = self.peer_relation.data[top_unit].get("state") - - # if top of stack is completed, leader pops it - if self.charm.unit.is_leader() and top_state == "completed": - logger.debug(f"{top_unit} has finished upgrading, updating stack...") - - # writes the mutated attr back to rel data - self.peer_relation.data[self.charm.app].update( - {"upgrade-stack": json.dumps(self.upgrade_stack)} - ) - - # recurse on leader to ensure relation changed event not lost - # in case leader is next or the last unit to complete - self.on_upgrade_changed(event) - - # if unit top of stack and all units ready (i.e stack), emit granted event - if ( - self.charm.unit == top_unit - and top_state in ["ready", "upgrading"] - and self.cluster_state == "ready" - and "upgrading" not in self.other_unit_states - ): - logger.debug( - f"{top_unit.name} is next to upgrade, emitting `upgrade_granted` event and upgrading..." - ) - self.charm.unit.status = MaintenanceStatus("upgrading...") - self.peer_relation.data[self.charm.unit].update({"state": "upgrading"}) - - try: - getattr(self.on, "upgrade_granted").emit() - except DependencyError as e: - logger.error(e) - self.set_unit_failed() - return - - def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None: - """Handler for `upgrade-granted` events. - - Handlers of this event must meet the following: - - SHOULD check for related application deps from :class:`DataUpgrade.dependencies` - - MAY raise :class:`DependencyError` if dependency not met - - MUST update unit `state` after validating the success of the upgrade, calling one of: - - :class:`DataUpgrade.set_unit_failed` if the unit upgrade fails - - :class:`DataUpgrade.set_unit_completed` if the unit upgrade succeeds - - MUST call :class:`DataUpgarde.on_upgrade_changed` on exit so event not lost on leader - """ - # don't raise if k8s substrate, only return - if self.substrate == "k8s": - return - - raise NotImplementedError - - def _on_upgrade_finished(self, _) -> None: - """Handler for `upgrade-finished` events.""" - if self.substrate == "vm" or not self.peer_relation: - return - - # Emit the upgrade relation changed event in the leader to update the upgrade_stack. 
- if self.charm.unit.is_leader(): - self.charm.on[self.relation_name].relation_changed.emit( - self.model.get_relation(self.relation_name) - ) - - # This hook shouldn't run for the last unit (the first that is upgraded). For that unit it - # should be done through an action after the upgrade success on that unit is double-checked. - unit_number = int(self.charm.unit.name.split("/")[1]) - if unit_number == len(self.peer_relation.units): - logger.info( - f"{self.charm.unit.name} unit upgraded. Evaluate and run `resume-upgrade` action to continue upgrade" - ) - return - - # Also, the hook shouldn't run for the first unit (the last that is upgraded). - if unit_number == 0: - logger.info(f"{self.charm.unit.name} unit upgraded. Upgrade is complete") - return - - try: - # Use the unit number instead of the upgrade stack to avoid race conditions - # (i.e. the leader updates the upgrade stack after this hook runs). - next_partition = unit_number - 1 - logger.debug(f"Set rolling update partition to unit {next_partition}") - self._set_rolling_update_partition(partition=next_partition) - except KubernetesClientError: - logger.exception("Cannot set rolling update partition") - self.set_unit_failed() - self.log_rollback_instructions() - - def _set_rolling_update_partition(self, partition: int) -> None: - """Patch the StatefulSet's `spec.updateStrategy.rollingUpdate.partition`. - - Args: - partition: partition to set. - - K8s only. It should decrement the rolling update strategy partition by using a code - like the following: - - from lightkube.core.client import Client - from lightkube.core.exceptions import ApiError - from lightkube.resources.apps_v1 import StatefulSet - - try: - patch = {"spec": {"updateStrategy": {"rollingUpdate": {"partition": partition}}}} - Client().patch(StatefulSet, name=self.charm.model.app.name, namespace=self.charm.model.name, obj=patch) - logger.debug(f"Kubernetes StatefulSet partition set to {partition}") - except ApiError as e: - if e.status.code == 403: - cause = "`juju trust` needed" - else: - cause = str(e) - raise KubernetesClientError("Kubernetes StatefulSet patch failed", cause) - """ - if self.substrate == "vm": - return - - raise NotImplementedError diff --git a/lib/charms/mongodb/v0/config_server_interface.py b/lib/charms/mongodb/v0/config_server_interface.py index dadf4199f..e4bdf2e7b 100644 --- a/lib/charms/mongodb/v0/config_server_interface.py +++ b/lib/charms/mongodb/v0/config_server_interface.py @@ -82,12 +82,14 @@ def pass_hook_checks(self, event: EventBase) -> bool: ) return False - if not self.charm.upgrade.idle: - logger.info("cannot process %s, upgrade is in progress", event) - event.defer() + if not self.charm.unit.is_leader(): return False - if not self.charm.unit.is_leader(): + if self.charm.upgrade_in_progress: + logger.warning( + "Processing mongos applications is not supported during an upgrade. The charm may be in a broken, unrecoverable state." + ) + event.defer() return False return True @@ -135,6 +137,11 @@ def _on_relation_changed(self, event) -> None: self.database_provides.update_relation_data(event.relation.id, relation_data) def _on_relation_broken(self, event) -> None: + if self.charm.upgrade_in_progress: + logger.warning( + "Removing integration to mongos is not supported during an upgrade. The charm may be in a broken, unrecoverable state." 
+ ) + # Only relation_deparated events can check if scaling down departed_relation_id = event.relation.id if not self.charm.has_departed_run(departed_relation_id): @@ -177,7 +184,7 @@ def generate_config_server_db(self) -> str: """Generates the config server database for mongos to connect to.""" replica_set_name = self.charm.app.name hosts = [] - for host in self.charm._unit_ips: + for host in self.charm.unit_ips: hosts.append(f"{host}:{Config.MONGODB_PORT}") hosts = ",".join(hosts) diff --git a/lib/charms/mongodb/v0/mongodb.py b/lib/charms/mongodb/v0/mongodb.py index 2b2495e96..cb45dd564 100644 --- a/lib/charms/mongodb/v0/mongodb.py +++ b/lib/charms/mongodb/v0/mongodb.py @@ -38,6 +38,10 @@ logger = logging.getLogger(__name__) +class FailedToMovePrimaryError(Exception): + """Raised when attempt to move a primary fails.""" + + @dataclass class MongoDBConfiguration: """Class for MongoDB configuration. @@ -290,6 +294,60 @@ def step_down_primary(self) -> None: """Steps down the current primary, forcing a re-election.""" self.client.admin.command("replSetStepDown", {"stepDownSecs": "60"}) + def move_primary(self, new_primary_ip: str) -> None: + """Forcibly moves the primary to the new primary provided. + + Args: + new_primary_ip: ip address of the unit chosen to be the new primary. + """ + # Do not move a priary unless the cluster is in sync + rs_status = self.client.admin.command("replSetGetStatus") + if self.is_any_sync(rs_status): + # it can take a while, we should defer + raise NotReadyError + + is_move_successful = True + self.set_replicaset_election_priority(priority=0.5, ignore_member=new_primary_ip) + try: + for attempt in Retrying(stop=stop_after_delay(180), wait=wait_fixed(3)): + with attempt: + self.step_down_primary() + if self.primary() != new_primary_ip: + raise FailedToMovePrimaryError + except RetryError: + # catch all possible exceptions when failing to step down primary. We do this in order + # to ensure that we reset the replica set election priority. + is_move_successful = False + + # reset all replicas to the same priority + self.set_replicaset_election_priority(priority=1) + + if not is_move_successful: + raise FailedToMovePrimaryError + + def set_replicaset_election_priority(self, priority: int, ignore_member: str = None) -> None: + """Set the election priority for the entire replica set.""" + rs_config = self.client.admin.command("replSetGetConfig") + rs_config = rs_config["config"] + rs_config["version"] += 1 + + # keep track of the original configuration before setting the priority, reconfiguring the + # replica set can result in primary re-election, which would would like to avoid when + # possible. + original_rs_config = rs_config + + for member in rs_config["members"]: + if member["host"] == ignore_member: + continue + + member["priority"] = priority + + if original_rs_config == rs_config: + return + + logger.debug("rs_config: %r", rs_config) + self.client.admin.command("replSetReconfig", rs_config) + def create_user(self, config: MongoDBConfiguration): """Create user. 
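The `move_primary` and `set_replicaset_election_priority` helpers added above are consumed by the new upgrade code under `src/upgrades/` (only summarized in this patch). A hedged sketch of the intended call pattern — `step_down_if_primary` is a hypothetical helper, and it assumes this module's `MongoDBConnection` wrapper plus the `unit_ip`/`unit_ips` renames introduced elsewhere in this PR:

```python
# Sketch only: hand the primary role to a peer before restarting this unit's
# mongod during an upgrade. move_primary() raises NotReadyError while the
# replica set is still syncing and FailedToMovePrimaryError if re-election
# fails, so the caller is expected to retry or defer on those exceptions.
from charms.mongodb.v0.mongodb import MongoDBConnection


def step_down_if_primary(charm) -> None:
    with MongoDBConnection(charm.mongodb_config) as mongod:
        this_ip = charm.unit_ip(charm.unit)
        if mongod.primary() != this_ip:
            return  # another unit already holds the primary role
        new_primary_ip = next(ip for ip in charm.unit_ips if ip != this_ip)
        mongod.move_primary(new_primary_ip)
```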
diff --git a/lib/charms/mongodb/v0/mongodb_tls.py b/lib/charms/mongodb/v0/mongodb_tls.py index a152f9fe3..a421e1b21 100644 --- a/lib/charms/mongodb/v0/mongodb_tls.py +++ b/lib/charms/mongodb/v0/mongodb_tls.py @@ -73,10 +73,6 @@ def is_tls_enabled(self, internal: bool): def _on_set_tls_private_key(self, event: ActionEvent) -> None: """Set the TLS private key, which will be used for requesting the certificate.""" - if not self.charm.upgrade.idle: - event.fail("Cannot set TLS key - upgrade is in progress.") - return - logger.debug("Request to set TLS private key received.") if self.charm.is_role(Config.Role.MONGOS) and not self.charm.has_config_server(): logger.error( @@ -85,6 +81,11 @@ def _on_set_tls_private_key(self, event: ActionEvent) -> None: event.fail("Mongos cannot set TLS keys until integrated to config-server.") return + if self.charm.upgrade_in_progress: + logger.warning("Setting TLS key during an upgrade is not supported.") + event.fail("Setting TLS key during an upgrade is not supported.") + return + try: self.request_certificate(event.params.get("external-key", None), internal=False) self.request_certificate(event.params.get("internal-key", None), internal=True) @@ -145,8 +146,10 @@ def _on_tls_relation_joined(self, event: RelationJoinedEvent) -> None: event.defer() return - if not self.charm.upgrade.idle: - logger.info("cannot process %s, upgrade is in progress", event) + if self.charm.upgrade_in_progress: + logger.warning( + "Enabling TLS is not supported during an upgrade. The charm may be in a broken, unrecoverable state." + ) event.defer() return @@ -155,12 +158,11 @@ def _on_tls_relation_joined(self, event: RelationJoinedEvent) -> None: def _on_tls_relation_broken(self, event: RelationBrokenEvent) -> None: """Disable TLS when TLS relation broken.""" - if not self.charm.upgrade.idle: - logger.info("cannot process %s, upgrade is in progress", event) - event.defer() - return - logger.debug("Disabling external and internal TLS for unit: %s", self.charm.unit.name) + if self.charm.upgrade_in_progress: + logger.warning( + "Disabling TLS is not supported during an upgrade. The charm may be in a broken, unrecoverable state." + ) for internal in [True, False]: self.set_tls_secret(internal, Config.TLS.SECRET_CA_LABEL, None) @@ -179,11 +181,6 @@ def _on_tls_relation_broken(self, event: RelationBrokenEvent) -> None: def _on_certificate_available(self, event: CertificateAvailableEvent) -> None: """Enable TLS when TLS certificate available.""" - if not self.charm.upgrade.idle: - logger.info("cannot process %s, upgrade is in progress", event) - event.defer() - return - if self.charm.is_role(Config.Role.MONGOS) and not self.charm.config_server_db: logger.debug( "mongos requires config-server in order to start, do not restart with TLS until integrated to config-server" @@ -251,11 +248,6 @@ def waiting_for_certs(self): def _on_certificate_expiring(self, event: CertificateExpiringEvent) -> None: """Request the new certificate when old certificate is expiring.""" - if not self.charm.upgrade.idle: - logger.info("cannot process %s, upgrade is in progress", event) - event.defer() - return - if self.charm.is_role(Config.Role.MONGOS) and not self.charm.has_config_server(): logger.info( "mongos is not running (not integrated to config-server) deferring renewal of certificates." 
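The TLS hunks above follow the convention used throughout this PR: the old `self.charm.upgrade.idle` checks are replaced by a single charm-level `upgrade_in_progress` property, and a handler either defers (when the operation can safely wait, e.g. enabling TLS) or only logs a warning (when blocking would make matters worse, e.g. relation-broken or unit removal). Schematically, for a hypothetical handler:

```python
# Illustration of the guard convention only; not a handler from this charm.
def _on_example_relation_changed(self, event) -> None:
    if self.charm.upgrade_in_progress:
        logger.warning(
            "This operation is not supported during an upgrade. "
            "The charm may be in a broken, unrecoverable state."
        )
        # deferral is dropped in broken/departed handlers, which must proceed
        event.defer()
        return
    # ... normal processing
```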
@@ -344,7 +336,7 @@ def get_tls_files(self, internal: bool) -> Tuple[Optional[str], Optional[str]]: def get_host(self, unit: Unit): """Retrieves the hostname of the unit based on the substrate.""" if self.substrate == "vm": - return self.charm._unit_ip(unit) + return self.charm.unit_ip(unit) else: return self.charm.get_hostname_for_unit(unit) diff --git a/lib/charms/mongodb/v1/mongodb_backups.py b/lib/charms/mongodb/v1/mongodb_backups.py index 9828d7a6b..06c05b66d 100644 --- a/lib/charms/mongodb/v1/mongodb_backups.py +++ b/lib/charms/mongodb/v1/mongodb_backups.py @@ -125,10 +125,12 @@ def __init__(self, charm): def on_s3_relation_joined(self, event: RelationJoinedEvent) -> None: """Checks for valid integration for s3-integrations.""" - if not self.charm.upgrade.idle: - logger.info("cannot process %s, upgrade is in progress", event) + if self.charm.upgrade_in_progress: + logger.warning( + "Adding s3-relations is not supported during an upgrade. The charm may be in a broken, unrecoverable state." + ) event.defer() - return False + return if not self.is_valid_s3_integration(): logger.debug( @@ -143,6 +145,13 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent): # handling PBM configurations requires that MongoDB is running and the pbm snap is # installed. action = "configure-pbm" + if self.charm.upgrade_in_progress: + logger.warning( + "Changing s3-credentials is not supported during an upgrade. The charm may be in a broken, unrecoverable state." + ) + event.defer() + return + if not self._pass_sanity_checks(event, action): return @@ -162,6 +171,11 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent): def _on_create_backup_action(self, event) -> None: action = "backup" + if self.charm.upgrade_in_progress: + logger.debug("Creating a backup is not supported during an upgrade.") + event.fail("Creating a backup is not supported during an upgrade.") + return + if not self._pass_sanity_checks(event, action): return @@ -241,6 +255,11 @@ def _on_list_backups_action(self, event) -> None: def _on_restore_action(self, event) -> None: action = "restore" + if self.charm.upgrade_in_progress: + logger.debug("Restoring a backup is not supported during an upgrade.") + event.fail("Restoring a backup is not supported during an upgrade.") + return + if not self._pass_sanity_checks(event, action): return @@ -329,11 +348,6 @@ def _pass_sanity_checks(self, event, action) -> bool: No matter what backup-action is being run, these requirements must be met. """ - if not self.charm.upgrade.idle: - logger.info("cannot process %s, upgrade is in progress", event) - event.defer() - return False - if not self.is_valid_s3_integration(): self._fail_action_with_error_log( event, @@ -729,7 +743,7 @@ def retrieve_error_message(self, pbm_status: Dict) -> str: for host_info in cluster["nodes"]: replica_info = ( - f"mongodb/{self.charm._unit_ip(self.charm.unit)}:{Config.MONGOS_PORT}" + f"mongodb/{self.charm.unit_ip(self.charm.unit)}:{Config.MONGOS_PORT}" ) if host_info["host"] == replica_info: break diff --git a/lib/charms/mongodb/v1/mongodb_provider.py b/lib/charms/mongodb/v1/mongodb_provider.py index 1b59ea7ab..a290844ea 100644 --- a/lib/charms/mongodb/v1/mongodb_provider.py +++ b/lib/charms/mongodb/v1/mongodb_provider.py @@ -89,11 +89,6 @@ def pass_hook_checks(self, event: EventBase) -> bool: """Runs the pre-hooks checks for MongoDBProvider, returns True if all pass.""" # We shouldn't try to create or update users if the database is not # initialised. 
We will create users as part of initialisation. - if not self.charm.upgrade.idle: - logger.info("cannot process %s, upgrade is in progress", event) - event.defer() - return False - if not self.charm.db_initialised: return False @@ -110,6 +105,13 @@ def pass_hook_checks(self, event: EventBase) -> bool: if not self.charm.unit.is_leader(): return False + if self.charm.upgrade_in_progress: + logger.warning( + "Adding relations is not supported during an upgrade. The charm may be in a broken, unrecoverable state." + ) + event.defer() + return False + return True def _on_relation_event(self, event): diff --git a/lib/charms/mongodb/v1/shards_interface.py b/lib/charms/mongodb/v1/shards_interface.py index fda0315bd..be3331479 100644 --- a/lib/charms/mongodb/v1/shards_interface.py +++ b/lib/charms/mongodb/v1/shards_interface.py @@ -132,7 +132,7 @@ def _on_relation_joined(self, event): KEYFILE_KEY: self.charm.get_secret( Config.Relations.APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME ), - HOSTS_KEY: json.dumps(self.charm._unit_ips), + HOSTS_KEY: json.dumps(self.charm.unit_ips), } # if tls enabled @@ -144,13 +144,8 @@ def _on_relation_joined(self, event): self.database_provides.update_relation_data(event.relation.id, relation_data) - def pass_hook_checks(self, event: EventBase) -> bool: - """Runs the pre-hooks checks for ShardingProvider, returns True if all pass.""" - if not self.charm.upgrade.idle: - logger.info("cannot process %s, upgrade is in progress", event) - event.defer() - return False - + def pass_sanity_hook_checks(self, event: EventBase) -> bool: + """Returns True if all the sanity hook checks for sharding pass.""" if not self.charm.db_initialised: logger.info("Deferring %s. db is not initialised.", str(type(event))) event.defer() @@ -170,6 +165,13 @@ def pass_hook_checks(self, event: EventBase) -> bool: if not self.charm.unit.is_leader(): return False + return True + + def pass_hook_checks(self, event: EventBase) -> bool: + """Runs the pre-hooks checks for ShardingProvider, returns True if all pass.""" + if not self.pass_sanity_hook_checks(event): + return False + # adding/removing shards while a backup/restore is in progress can be disastrous pbm_status = self.charm.backups.get_pbm_status() if isinstance(pbm_status, MaintenanceStatus): @@ -178,6 +180,12 @@ def pass_hook_checks(self, event: EventBase) -> bool: return False if isinstance(event, RelationBrokenEvent): + if self.charm.upgrade_in_progress: + # upgrades should not block the relation broken event + logger.warning( + "Removing shards is not supported during an upgrade. The charm may be in a broken, unrecoverable state" + ) + if not self.charm.has_departed_run(event.relation.id): logger.info( "Deferring, must wait for relation departed hook to decide if relation should be removed." @@ -187,6 +195,12 @@ def pass_hook_checks(self, event: EventBase) -> bool: if not self.charm.proceed_on_broken_event(event): return False + elif self.charm.upgrade_in_progress: + logger.warning( + "Adding/Removing/Updating shards is not supported during an upgrade. 
The charm may be in a broken, unrecoverable state" + ) + event.defer() + return False return True @@ -317,7 +331,7 @@ def update_mongos_hosts(self): return for relation in self.charm.model.relations[self.relation_name]: - self._update_relation_data(relation.id, {HOSTS_KEY: json.dumps(self.charm._unit_ips)}) + self._update_relation_data(relation.id, {HOSTS_KEY: json.dumps(self.charm.unit_ips)}) def update_ca_secret(self, new_ca: str) -> None: """Updates the new CA for all related shards.""" @@ -453,7 +467,7 @@ def get_unreachable_shards(self) -> List[str]: def is_mongos_running(self) -> bool: """Returns true if mongos service is running.""" - mongos_hosts = ",".join(self.charm._unit_ips) + mongos_hosts = ",".join(self.charm.unit_ips) uri = f"mongodb://{mongos_hosts}" with MongosConnection(None, uri) as mongo: return mongo.is_ready @@ -547,11 +561,6 @@ def _handle_changed_secrets(self, event) -> None: Changes in secrets do not re-trigger a relation changed event, so it is necessary to listen to secret changes events. """ - if not self.charm.upgrade.idle: - logger.info("cannot process %s, upgrade is in progress", event) - event.defer() - return False - if ( not self.charm.unit.is_leader() or not event.secret.label @@ -655,11 +664,6 @@ def sync_cluster_passwords( def _on_relation_joined(self, event: RelationJoinedEvent): """Sets status and flags in relation data relevant to sharding.""" - if not self.charm.upgrade.idle: - logger.info("cannot process %s, upgrade is in progress", event) - event.defer() - return - # if re-using an old shard, re-set flags. self.charm.unit_peer_data["drained"] = json.dumps(False) self.charm.unit.status = MaintenanceStatus("Adding shard to config-server") @@ -706,13 +710,35 @@ def _on_relation_changed(self, event): self.charm.app_peer_data["mongos_hosts"] = json.dumps(self.get_mongos_hosts()) - def pass_hook_checks(self, event): + def pass_hook_checks(self, event: EventBase): """Runs the pre-hooks checks for ConfigServerRequirer, returns True if all pass.""" - if not self.charm.upgrade.idle: - logger.info("cannot process %s, upgrade is in progress", event) - event.defer() + if not self.pass_sanity_hook_checks(event): return False + # occasionally, broken events have no application, in these scenarios nothing should be + # processed. + if not event.relation.app and isinstance(event, RelationBrokenEvent): + return False + + mongos_hosts = event.relation.data[event.relation.app].get(HOSTS_KEY) + + if isinstance(event, RelationBrokenEvent) and not mongos_hosts: + logger.info("Config-server relation never set up, no need to process broken event.") + return False + + if self.charm.upgrade_in_progress: + logger.warning( + "Adding/Removing shards is not supported during an upgrade. The charm may be in a broken, unrecoverable state" + ) + if not isinstance(event, RelationBrokenEvent): + # upgrades should not block relation broken events + event.defer() + return False + + return self.pass_tls_hook_checks(event) + + def pass_sanity_hook_checks(self, event: EventBase) -> bool: + """Returns True if all the sanity hook checks for sharding pass.""" if not self.charm.db_initialised: logger.info("Deferring %s. db is not initialised.", str(type(event))) event.defer() @@ -726,17 +752,10 @@ def pass_hook_checks(self, event): logger.info("skipping %s is only be executed by shards", str(type(event))) return False - # occasionally, broken events have no application, in these scenarios nothing should be - # processed. 
- if not event.relation.app: - return False - - mongos_hosts = event.relation.data[event.relation.app].get(HOSTS_KEY, None) - - if isinstance(event, RelationBrokenEvent) and not mongos_hosts: - logger.info("Config-server relation never set up, no need to process broken event.") - return False + return True + def pass_tls_hook_checks(self, event: EventBase) -> bool: + """Returns True if the TLS checks for sharding pass.""" if self.is_shard_tls_missing(): logger.info( "Deferring %s. Config-server uses TLS, but shard does not. Please synchronise encryption methods.", @@ -761,7 +780,6 @@ def pass_hook_checks(self, event): event.defer() return False - return True def _on_relation_broken(self, event: RelationBrokenEvent) -> None: diff --git a/metadata.yaml b/metadata.yaml index d247fe05c..5e88f70ba 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -38,7 +38,10 @@ storage: peers: database-peers: interface: mongodb-peers - upgrade: + upgrade-version-a: + # Relation versioning scheme: + # DA056 - Upgrading in-place upgrade protocol + # https://docs.google.com/document/d/1H7qy5SAwLiCOKO9xMQJbbQP5_-jGV6Lhi-mJOk4gZ08/edit interface: upgrade requires: diff --git a/pyproject.toml b/pyproject.toml index 6ac6d2d93..11e2a326d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,8 +30,9 @@ max-complexity = 10 exclude = [".git", "__pycache__", ".tox", "build", "dist", "*.egg_info", "venv", "tests/integration/relation_tests/new_relations/application-charm/lib"] select = ["E", "W", "F", "C", "N", "R", "D", "H"] # Ignore W503, E501 because using black creates errors with this +# Ignore N818 Exception should be named with an Error suffix # Ignore D107 Missing docstring in __init__ -ignore = ["W503", "E501", "D107"] +ignore = ["W503", "E501", "D107", "N818"] # D100, D101, D102, D103: Ignore missing docstrings in tests per-file-ignores = ["tests/*:D100,D101,D102,D103,D104"] docstring-convention = "google" diff --git a/requirements.txt b/requirements.txt index 426ff4185..4c3750f90 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,4 +20,4 @@ parameterized==0.9.0 pydantic==1.10.7 # Future PR - use poetry in MongoDB Charm poetry==1.8.2 -jinja2==3.1.3 +jinja2==3.1.3 \ No newline at end of file diff --git a/src/charm.py b/src/charm.py index 79baba57d..e09c3d1c6 100755 --- a/src/charm.py +++ b/src/charm.py @@ -76,7 +76,6 @@ from tenacity import Retrying, before_log, retry, stop_after_attempt, wait_fixed from config import Config, Package -from events.upgrade import MongoDBDependencyModel, MongoDBUpgrade from exceptions import AdminUserCreationError, ApplicationHostNotFoundError from machine_helpers import ( MONGO_USER, @@ -84,6 +83,7 @@ setup_logrotate_and_cron, update_mongod_service, ) +from upgrades.mongodb_upgrade import MongoDBUpgrade AUTH_FAILED_CODE = 18 UNAUTHORISED_CODE = 13 @@ -136,12 +136,7 @@ def __init__(self, *args): self.legacy_client_relations = MongoDBLegacyProvider(self) self.tls = MongoDBTLS(self, Config.Relations.PEERS, substrate=Config.SUBSTRATE) self.backups = MongoDBBackups(self) - self.upgrade = MongoDBUpgrade( - self, - dependency_model=MongoDBDependencyModel( - **Config.DEPENDENCIES # pyright: ignore[reportGeneralTypeIssues, reportArgumentType] - ), - ) # TODO future PR add dependency_model + self.upgrade = MongoDBUpgrade(self) self.config_server = ShardingProvider(self) self.cluster = ClusterProvider(self) self.shard = ConfigServerRequirer(self) @@ -166,7 +161,7 @@ def _mongo_scrape_config(self) -> List[Dict]: "static_configs": [ { "targets": [ - 
f"{self._unit_ip(self.unit)}:{Config.Monitoring.MONGODB_EXPORTER_PORT}" + f"{self.unit_ip(self.unit)}:{Config.Monitoring.MONGODB_EXPORTER_PORT}" ], "labels": {"cluster": self.app.name, "replication_set": self.app.name}, } @@ -185,12 +180,12 @@ def primary(self) -> str: return None # check if current unit matches primary ip - if primary_ip == self._unit_ip(self.unit): + if primary_ip == self.unit_ip(self.unit): return self.unit.name # check if peer unit matches primary ip for unit in self.peers.units: - if primary_ip == self._unit_ip(unit): + if primary_ip == self.unit_ip(unit): return unit.name return None @@ -205,7 +200,7 @@ def drained(self) -> bool: return self.unit_peer_data.get("drained", False) @property - def _unit_ips(self) -> List[str]: + def unit_ips(self) -> List[str]: """Retrieve IP addresses associated with MongoDB application. Returns: @@ -213,10 +208,10 @@ def _unit_ips(self) -> List[str]: """ peer_addresses = [] if self.peers: - peer_addresses = [self._unit_ip(unit) for unit in self.peers.units] + peer_addresses = [self.unit_ip(unit) for unit in self.peers.units] logger.debug("peer addresses: %s", peer_addresses) - self_address = self._unit_ip(self.unit) + self_address = self.unit_ip(self.unit) logger.debug("unit address: %s", self_address) addresses = [] if peer_addresses: @@ -236,7 +231,7 @@ def _replica_set_hosts(self): @property def mongos_config(self) -> MongoDBConfiguration: """Generates a MongoDBConfiguration object for mongos in the deployment of MongoDB.""" - return self._get_mongos_config_for_user(OperatorUser, set(self._unit_ips)) + return self._get_mongos_config_for_user(OperatorUser, set(self.unit_ips)) def remote_mongos_config(self, hosts) -> MongoDBConfiguration: """Generates a MongoDBConfiguration object for mongos in the deployment of MongoDB.""" @@ -247,7 +242,7 @@ def remote_mongos_config(self, hosts) -> MongoDBConfiguration: @property def mongodb_config(self) -> MongoDBConfiguration: """Generates a MongoDBConfiguration object for this deployment of MongoDB.""" - return self._get_mongodb_config_for_user(OperatorUser, set(self._unit_ips)) + return self._get_mongodb_config_for_user(OperatorUser, set(self.unit_ips)) @property def monitor_config(self) -> MongoDBConfiguration: @@ -327,6 +322,13 @@ def db_initialised(self, value): f"'db_initialised' must be a boolean value. Proivded: {value} is of type {type(value)}" ) + @property + def upgrade_in_progress(self): + """Whether upgrade is in progress.""" + if not self.upgrade._upgrade: + return False + return self.upgrade._upgrade.in_progress + # END: properties # BEGIN: charm event handlers @@ -358,7 +360,7 @@ def _on_install(self, event: InstallEvent) -> None: # Construct the mongod startup commandline args for systemd and reload the daemon. update_mongod_service( auth=auth, - machine_ip=self._unit_ip(self.unit), + machine_ip=self.unit_ip(self.unit), config=self.mongodb_config, role=self.role, ) @@ -373,7 +375,15 @@ def _on_config_changed(self, event: ConfigChangedEvent) -> None: unresponsive therefore causing a cluster failure, error the component. This prevents it from executing other hooks with a new role. """ - if self.upgrade.idle and self.is_role_changed(): + if self.is_role_changed(): + + if self.upgrade_in_progress: + logger.warning( + "Changing config options is not permitted during an upgrade. The charm may be in a broken, unrecoverable state." 
+ ) + event.defer() + return + # TODO in the future (24.04) support migration of components logger.error( f"cluster migration currently not supported, cannot change from { self.model.config['role']} to {self.role}" @@ -447,8 +457,10 @@ def _on_relation_joined(self, event: RelationJoinedEvent) -> None: if not self.unit.is_leader(): return - if not self.upgrade.idle: - logger.info("cannot process %s, upgrade is in progress", event) + if self.upgrade_in_progress: + logger.warning( + "Adding replicas during an upgrade is not supported. The charm may be in a broken, unrecoverable state" + ) event.defer() return @@ -462,8 +474,10 @@ def _on_relation_handler(self, event: RelationEvent) -> None: Args: event: The triggering relation joined/changed event. """ - if not self.upgrade.idle: - logger.info("cannot process %s, upgrade is in progress", event) + if self.upgrade_in_progress: + logger.warning( + "Adding/Removing/Changing replicas during an upgrade is not supported. The charm may be in a broken, unrecoverable state" + ) event.defer() return @@ -510,11 +524,6 @@ def _on_relation_handler(self, event: RelationEvent) -> None: def _on_leader_elected(self, event: LeaderElectedEvent) -> None: """Generates necessary keyfile and updates replica hosts.""" - if not self.upgrade.idle: - logger.info("cannot process %s, upgrade is in progress", event) - event.defer() - return - if not self.get_secret(APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME): self._generate_secrets() @@ -530,10 +539,13 @@ def _on_relation_departed(self, event: RelationDepartedEvent) -> None: if not self.unit.is_leader() or event.departing_unit == self.unit: return - if not self.upgrade.idle: - logger.info("cannot process %s, upgrade is in progress", event) - event.defer() - return + if self.upgrade_in_progress: + # do not defer or return here, if a user removes a unit, the config will be incorrect + # and lead to MongoDB reporting that the replica set is unhealthy, we should make an + # attempt to fix the replica set configuration even if an upgrade is occurring. + logger.warning( + "Removing replicas during an upgrade is not supported. The charm may be in a broken, unrecoverable state" + ) self._update_hosts(event) @@ -543,6 +555,12 @@ def _on_storage_detaching(self, event: StorageDetachingEvent) -> None: If the removing unit is primary also allow it to step down and elect another unit as primary while it still has access to its storage. """ + if self.upgrade_in_progress: + # We cannot defer and prevent a user from removing a unit, log a warning instead. + logger.warning( + "Removing replicas during an upgrade is not supported. The charm may be in a broken, unrecoverable state" + ) + # A single replica cannot step down as primary and we cannot reconfigure the replica set to # have 0 members. if self._is_removing_last_replica: @@ -566,7 +584,7 @@ def _on_storage_detaching(self, event: StorageDetachingEvent) -> None: try: # retries over a period of 10 minutes in an attempt to resolve race conditions it is # not possible to defer in storage detached. 
- logger.debug("Removing %s from replica set", self._unit_ip(self.unit)) + logger.debug("Removing %s from replica set", self.unit_ip(self.unit)) for attempt in Retrying( stop=stop_after_attempt(10), wait=wait_fixed(1), @@ -575,7 +593,7 @@ def _on_storage_detaching(self, event: StorageDetachingEvent) -> None: with attempt: # remove_replset_member retries for 60 seconds with MongoDBConnection(self.mongodb_config) as mongo: - mongo.remove_replset_member(self._unit_ip(self.unit)) + mongo.remove_replset_member(self.unit_ip(self.unit)) except NotReadyError: logger.info( "Failed to remove %s from replica set, another member is syncing", self.unit.name @@ -584,10 +602,6 @@ def _on_storage_detaching(self, event: StorageDetachingEvent) -> None: logger.error("Failed to remove %s from replica set, error=%r", self.unit.name, e) def _on_update_status(self, event: UpdateStatusEvent): - if not self.upgrade.idle: - logger.info("Processing upgrade, wait to check status") - return - # user-made mistakes might result in other incorrect statues. Prioritise informing users of # their mistake. invalid_integration_status = self.get_invalid_integration_status() @@ -612,11 +626,13 @@ def _on_update_status(self, event: UpdateStatusEvent): self.unit.status = WaitingStatus("Waiting for MongoDB to start") return - # leader should periodically handle configuring the replica set. Incidents such as network - # cuts can lead to new IP addresses and therefore will require a reconfigure. Especially - # in the case that the leader a change in IP address it will not receive a relation event. - if self.unit.is_leader(): - self._handle_reconfigure(event) + try: + self.perform_self_healing(event) + except ServerSelectionTimeoutError: + # health checks that are performed too early will fail if the hasn't elected a primary + # yet. This can occur if the deployment has restarted or is undergoing a re-election + deployment_mode = "replica set" if self.is_role(Config.Role.REPLICATION) else "cluster" + WaitingStatus(f"Waiting to sync internal membership across the {deployment_mode}") self.unit.status = self.process_statuses() @@ -637,10 +653,6 @@ def _on_get_password(self, event: ActionEvent) -> None: def _on_set_password(self, event: ActionEvent) -> None: """Set the password for the admin user.""" - if not self.upgrade.idle: - event.fail("Cannot set password, upgrade is in progress.") - return - # check conditions for setting the password and fail if necessary if not self.pass_pre_set_password_checks(event): return @@ -875,18 +887,23 @@ def pass_pre_set_password_checks(self, event: ActionEvent) -> bool: """Checks conditions for setting the password and fail if necessary.""" if self.is_role(Config.Role.SHARD): event.fail("Cannot set password on shard, please set password on config-server.") - return + return False # changing the backup password while a backup/restore is in progress can be disastrous pbm_status = self.backups.get_pbm_status() if isinstance(pbm_status, MaintenanceStatus): event.fail("Cannot change password while a backup/restore is in progress.") - return + return False # only leader can write the new password into peer relation. 
         if not self.unit.is_leader():
             event.fail("The action can be run only on leader unit.")
-            return
+            return False
+
+        if self.upgrade_in_progress:
+            logger.debug("Cannot set the password while an upgrade is in progress.")
+            event.fail("Cannot set passwords while an upgrade is in progress.")
+            return False
 
         return True
 
@@ -913,7 +930,7 @@ def _update_hosts(self, event: LeaderElectedEvent) -> None:
             return
 
         self.process_unremoved_units(event)
-        self.app_peer_data["replica_set_hosts"] = json.dumps(self._unit_ips)
+        self.app_peer_data["replica_set_hosts"] = json.dumps(self.unit_ips)
 
         self._update_related_hosts(event)
 
@@ -943,12 +960,12 @@ def process_unremoved_units(self, event: LeaderElectedEvent) -> None:
                 logger.error("Deferring process_unremoved_units: error=%r", e)
                 event.defer()
 
-    def _handle_reconfigure(self, event: UpdateStatusEvent):
+    def perform_self_healing(self, event: UpdateStatusEvent):
         """Reconfigures the replica set if necessary.
 
-        Removes any mongod hosts that are no longer present in the replica set or adds hosts that
-        should exist in the replica set. This function is meant to be called periodically by the
-        leader in the update status hook to perform any necessary cluster healing.
+        Incidents such as network cuts can lead to new IP addresses and therefore will require a
+        reconfigure. Especially in the case that the leader's IP address changed, it will not
+        receive a relation event.
         """
         if not self.unit.is_leader():
             logger.debug("only the leader can perform reconfigurations to the replica set.")
@@ -962,6 +979,12 @@ def perform_self_healing(self, event: UpdateStatusEvent):
             event.unit = self.unit
             self._on_relation_handler(event)
 
+        # make sure all nodes in the replica set have the same priority for re-election. This is
+        # necessary in the case that the pre-upgrade hook fails to reset the priority of election for
+        # cluster nodes.
+        with MongoDBConnection(self.mongodb_config) as mongod:
+            mongod.set_replicaset_election_priority(priority=1)
+
     def _open_ports_tcp(self, ports: int) -> None:
         """Open the given port. 
@@ -1176,7 +1199,7 @@ def _initialise_replica_set(self, event: StartEvent) -> None: logger.info("Replica Set initialization") direct_mongo.init_replset() self.peers.data[self.app]["replica_set_hosts"] = json.dumps( - [self._unit_ip(self.unit)] + [self.unit_ip(self.unit)] ) logger.info("User initialization") @@ -1206,7 +1229,7 @@ def _initialise_replica_set(self, event: StartEvent) -> None: self.db_initialised = True self.unit.status = ActiveStatus() - def _unit_ip(self, unit: Unit) -> str: + def unit_ip(self, unit: Unit) -> str: """Returns the ip address of a given unit.""" # check if host is current host if unit == self.unit: @@ -1302,7 +1325,7 @@ def restart_charm_services(self, auth=None): self.stop_charm_services() update_mongod_service( auth, - self._unit_ip(self.unit), + self.unit_ip(self.unit), config=self.mongodb_config, role=self.role, ) @@ -1457,7 +1480,7 @@ def get_invalid_integration_status(self) -> Optional[StatusBase]: def get_statuses(self) -> Tuple: """Retrieves statuses for the different processes running inside the unit.""" - mongodb_status = build_unit_status(self.mongodb_config, self._unit_ip(self.unit)) + mongodb_status = build_unit_status(self.mongodb_config, self.unit_ip(self.unit)) shard_status = self.shard.get_shard_status() config_server_status = self.config_server.get_config_server_status() pbm_status = self.backups.get_pbm_status() diff --git a/src/config.py b/src/config.py index 7d164cd71..c2e0a118f 100644 --- a/src/config.py +++ b/src/config.py @@ -20,15 +20,6 @@ class Config: MONGOD_CONF_DIR = f"{MONGODB_SNAP_DATA_DIR}/etc/mongod" MONGOD_CONF_FILE_PATH = f"{MONGOD_CONF_DIR}/mongod.conf" SNAP_PACKAGES = [("charmed-mongodb", "6/edge", 118)] - DEPENDENCIES = { - "mongod_service": { - "dependencies": {}, - "name": "mongod", - "upgrade_supported": "^6.0.0,<7", - "version": "6.0.6", - }, - # TODO: Future PR - implement mongos deps when supporting sharding upgrades - } # Keep these alphabetically sorted class Actions: diff --git a/src/events/upgrade.py b/src/events/upgrade.py deleted file mode 100644 index f1522c76d..000000000 --- a/src/events/upgrade.py +++ /dev/null @@ -1,282 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Manager for handling MongoDB in-place upgrades.""" - -import logging -import secrets -import string -from typing import Tuple - -from charms.data_platform_libs.v0.upgrade import ( - ClusterNotReadyError, - DataUpgrade, - DependencyModel, - UpgradeGrantedEvent, -) -from charms.mongodb.v0.mongodb import MongoDBConfiguration, MongoDBConnection -from charms.operator_libs_linux.v2 import snap -from ops.charm import CharmBase -from ops.model import ActiveStatus -from pydantic import BaseModel -from tenacity import Retrying, retry, stop_after_attempt, wait_fixed -from typing_extensions import override - -from config import Config - -logger = logging.getLogger(__name__) - -WRITE_KEY = "write_value" -MONGOD_SERVICE = "mongod" - - -ROLLBACK_INSTRUCTIONS = """Unit failed to upgrade and requires manual rollback to previous stable version. - 1. Re-run `pre-upgrade-check` action on the leader unit to enter 'recovery' state - 2. 
Run `juju refresh` to the previously deployed charm revision -""" - - -class FailedToElectNewPrimaryError(Exception): - """Raised when a new primary isn't elected after stepping down.""" - - -class MongoDBDependencyModel(BaseModel): - """Model for MongoDB Operator dependencies.""" - - mongod_service: DependencyModel - # in future have a mongos service here too - - -class MongoDBUpgrade(DataUpgrade): - """Implementation of :class:`DataUpgrade` overrides for in-place upgrades.""" - - def __init__(self, charm: CharmBase, **kwargs): - super().__init__(charm, **kwargs) - self.charm = charm - - @property - def idle(self) -> bool: - """Checks if cluster has completed upgrade. - - Returns: - True if cluster has completed upgrade. Otherwise False - """ - return not bool(self.upgrade_stack) - - @override - def pre_upgrade_check(self) -> None: - """Verifies that an upgrade can be done on the MongoDB deployment.""" - default_message = "Pre-upgrade check failed and cannot safely upgrade" - - if self.charm.is_role(Config.Role.SHARD): - raise ClusterNotReadyError( - message=default_message, - cause="Cannot run pre-upgrade check on shards", - resolution="Run this action on config-server.", - ) - - if not self.is_cluster_healthy(): - raise ClusterNotReadyError( - message=default_message, - cause="Cluster is not healthy", - resolution="Please check juju status for information", - ) - - if not self.is_cluster_able_to_read_write(): - raise ClusterNotReadyError( - message=default_message, cause="Cluster cannot read/write - please check logs" - ) - - # Future PR - sharding based checks - - @retry( - stop=stop_after_attempt(20), - wait=wait_fixed(1), - reraise=True, - ) - def post_upgrade_check(self) -> None: - """Runs necessary checks validating the unit is in a healthy state after upgrade.""" - if not self.is_cluster_able_to_read_write(): - raise ClusterNotReadyError( - message="post-upgrade check failed and cannot safely upgrade", - cause="Cluster cannot read/write", - ) - - @override - def build_upgrade_stack(self) -> list[int]: - """Builds an upgrade stack, specifying the order of nodes to upgrade.""" - if self.charm.is_role(Config.Role.CONFIG_SERVER): - # TODO implement in a future PR a stack for shards and config server - pass - elif self.charm.is_role(Config.Role.REPLICATION): - return self.get_replica_set_upgrade_stack() - - def get_replica_set_upgrade_stack(self) -> list[int]: - """Builds an upgrade stack, specifying the order of nodes to upgrade. - - MongoDB Specific: The primary should be upgraded last, so the unit with the primary is - put at the very bottom of the stack. 
- """ - upgrade_stack = [] - units = set([self.charm.unit] + list(self.charm.peers.units)) # type: ignore[reportOptionalMemberAccess] - primary_unit_id = None - for unit in units: - unit_id = int(unit.name.split("/")[-1]) - if unit.name == self.charm.primary: - primary_unit_id = unit_id - continue - - upgrade_stack.append(unit_id) - - upgrade_stack.insert(0, primary_unit_id) - return upgrade_stack - - @override - def log_rollback_instructions(self) -> None: - """Logs the rollback instructions in case of failure to upgrade.""" - logger.critical(ROLLBACK_INSTRUCTIONS) - - @override - def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None: - """Execute a series of upgrade steps.""" - # TODO: Future PR - check compatibility of new mongod version with current mongos versions - self.charm.stop_charm_services() - - try: - self.charm.install_snap_packages(packages=Config.SNAP_PACKAGES) - except snap.SnapError: - logger.error("Unable to install Snap") - self.set_unit_failed() - return - - if self.charm.unit.name == self.charm.primary: - logger.debug("Stepping down current primary, before upgrading service...") - self.step_down_primary_and_wait_reelection() - - logger.info(f"{self.charm.unit.name} upgrading service...") - self.charm.restart_charm_services() - - try: - logger.debug("Running post-upgrade check...") - self.post_upgrade_check() - - logger.debug("Marking unit completed...") - self.set_unit_completed() - - # ensures leader gets it's own relation-changed when it upgrades - if self.charm.unit.is_leader(): - logger.debug("Re-emitting upgrade-changed on leader...") - self.on_upgrade_changed(event) - - except ClusterNotReadyError as e: - logger.error(e.cause) - self.set_unit_failed() - - def step_down_primary_and_wait_reelection(self) -> bool: - """Steps down the current primary and waits for a new one to be elected.""" - old_primary = self.charm.primary - with MongoDBConnection(self.charm.mongodb_config) as mongod: - mongod.step_down_primary() - - for attempt in Retrying(stop=stop_after_attempt(30), wait=wait_fixed(1), reraise=True): - with attempt: - new_primary = self.charm.primary - if new_primary != old_primary: - raise FailedToElectNewPrimaryError() - - def is_cluster_healthy(self) -> bool: - """Returns True if all nodes in the cluster/replcia set are healthy.""" - if self.charm.is_role(Config.Role.SHARD): - logger.debug("Cannot run full cluster health check on shards") - return False - - charm_status = self.charm.process_statuses() - return self.are_nodes_healthy() and isinstance(charm_status, ActiveStatus) - - def are_nodes_healthy(self) -> bool: - """Returns True if all nodes in the MongoDB deployment are healthy.""" - if self.charm.is_role(Config.Role.CONFIG_SERVER): - # TODO future PR implement this - pass - - if self.charm.is_role(Config.Role.REPLICATION): - with MongoDBConnection(self.charm.mongodb_config) as mongod: - rs_status = mongod.get_replset_status() - rs_status = mongod.client.admin.command("replSetGetStatus") - return not mongod.is_any_sync(rs_status) - - def is_cluster_able_to_read_write(self) -> bool: - """Returns True if read and write is feasible for cluster.""" - if self.charm.is_role(Config.Role.SHARD): - logger.debug("Cannot run read/write check on shard, must run via config-server.") - return False - elif self.charm.is_role(Config.Role.CONFIG_SERVER): - return self.is_sharded_cluster_able_to_read_write() - else: - return self.is_replica_set_able_read_write() - - def is_replica_set_able_read_write(self) -> bool: - """Returns True if is possible to write 
to primary and read from replicas.""" - collection_name, write_value = self.get_random_write_and_collection() - # add write to primary - self.add_write(self.charm.mongodb_config, collection_name, write_value) - - # verify writes on secondaries - with MongoDBConnection(self.charm.mongodb_config) as mongod: - primary_ip = mongod.primary() - - replica_ips = set(self.charm._unit_ips) - secondary_ips = replica_ips - set(primary_ip) - for secondary_ip in secondary_ips: - if not self.is_excepted_write_on_replica(secondary_ip, collection_name, write_value): - # do not return False immediately - as it is - logger.debug("Secondary with IP %s, does not contain the expected write.") - self.clear_tmp_collection(self.charm.mongodb_config, collection_name) - return False - - self.clear_tmp_collection(self.charm.mongodb_config, collection_name) - return True - - def is_sharded_cluster_able_to_read_write(self) -> bool: - """Returns True if is possible to write each shard and read value from all nodes. - - TODO: Implement in a future PR. - """ - return False - - def clear_tmp_collection( - self, mongodb_config: MongoDBConfiguration, collection_name: str - ) -> None: - """Clears the temporary collection.""" - with MongoDBConnection(mongodb_config) as mongod: - db = mongod.client["admin"] - db.drop_collection(collection_name) - - def is_excepted_write_on_replica( - self, host: str, collection: str, expected_write_value: str - ) -> bool: - """Returns True if the replica contains the expected write in the provided collection.""" - secondary_config = self.charm.mongodb_config - secondary_config.hosts = {host} - with MongoDBConnection(secondary_config, direct=True) as direct_seconary: - db = direct_seconary.client["admin"] - test_collection = db[collection] - query = test_collection.find({}, {WRITE_KEY: 1}) - return query[0][WRITE_KEY] == expected_write_value - - def get_random_write_and_collection(self) -> Tuple[str, str]: - """Returns a tutple for a random collection name and a unique write to add to it.""" - choices = string.ascii_letters + string.digits - collection_name = "collection_" + "".join([secrets.choice(choices) for _ in range(16)]) - write_value = "unique_write_" + "".join([secrets.choice(choices) for _ in range(16)]) - return (collection_name, write_value) - - def add_write( - self, mongodb_config: MongoDBConfiguration, collection_name, write_value - ) -> None: - """Adds a the provided write to the admin database with the provided collection.""" - with MongoDBConnection(mongodb_config) as mongod: - db = mongod.client["admin"] - test_collection = db[collection_name] - write = {WRITE_KEY: write_value} - test_collection.insert_one(write) diff --git a/src/status_exception.py b/src/status_exception.py new file mode 100644 index 000000000..fdd962cdc --- /dev/null +++ b/src/status_exception.py @@ -0,0 +1,14 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Class for exception with ops status.""" + +from ops import StatusBase + + +class StatusException(Exception): + """Exception with ops status.""" + + def __init__(self, status: StatusBase) -> None: + super().__init__(status.message) + self.status = status diff --git a/src/upgrades/machine_upgrade.py b/src/upgrades/machine_upgrade.py new file mode 100644 index 000000000..0c3ca0a1e --- /dev/null +++ b/src/upgrades/machine_upgrade.py @@ -0,0 +1,198 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. + +"""In-place upgrades on machines. 
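+
+The workload is delivered as the charmed-mongodb snap: the snap revision pinned in the charm
+code acts as the unit's workload container version and is recorded in the unit databag after
+install and after each upgrade.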
+ +Derived from specification: DA058 - In-Place Upgrades - Kubernetes v2 +(https://docs.google.com/document/d/1tLjknwHudjcHs42nzPVBNkHs98XxAOT2BXGGpP7NyEU/) +""" +import json +import logging +import time +import typing + +import ops + +from config import Config +from upgrades import mongodb_upgrade, upgrade + +logger = logging.getLogger(__name__) + +_SNAP_REVISION = str(Config.SNAP_PACKAGES[0][2]) + + +class Upgrade(upgrade.Upgrade): + """In-place upgrades on machines.""" + + @property + def unit_state(self) -> typing.Optional[upgrade.UnitState]: + """Returns the unit state.""" + if ( + self._unit_workload_container_version is not None + and self._unit_workload_container_version != self._app_workload_container_version + ): + logger.debug("Unit upgrade state: outdated") + return upgrade.UnitState.OUTDATED + return super().unit_state + + @unit_state.setter + def unit_state(self, value: upgrade.UnitState) -> None: + # Super call + upgrade.Upgrade.unit_state.fset(self, value) + + def _get_unit_healthy_status(self) -> ops.StatusBase: + if self._unit_workload_container_version == self._app_workload_container_version: + return ops.ActiveStatus( + f'MongoDB {self._unit_workload_version} running; Snap rev {self._unit_workload_container_version}; Charmed operator {self._current_versions["charm"]}' + ) + return ops.ActiveStatus( + f'MongoDB {self._unit_workload_version} running; Snap rev {self._unit_workload_container_version} (outdated); Charmed operator {self._current_versions["charm"]}' + ) + + @property + def app_status(self) -> typing.Optional[ops.StatusBase]: + """App upgrade status.""" + if not self.is_compatible: + logger.info( + "Upgrade incompatible. If you accept potential *data loss* and *downtime*, you can continue by running `force-upgrade` action on each remaining unit" + ) + return ops.BlockedStatus( + "Upgrade incompatible. Rollback to previous revision with `juju refresh`" + ) + return super().app_status + + @property + def _unit_workload_container_versions(self) -> typing.Dict[str, str]: + """{Unit name: installed snap revision}.""" + versions = {} + for unit in self._sorted_units: + if version := (self._peer_relation.data[unit].get("snap_revision")): + versions[unit.name] = version + return versions + + @property + def _unit_workload_container_version(self) -> typing.Optional[str]: + """Installed snap revision for this unit.""" + return self._unit_databag.get("snap_revision") + + @_unit_workload_container_version.setter + def _unit_workload_container_version(self, value: str): + self._unit_databag["snap_revision"] = value + + @property + def _app_workload_container_version(self) -> str: + """Snap revision for current charm code.""" + return _SNAP_REVISION + + @property + def _unit_workload_version(self) -> typing.Optional[str]: + """Installed OpenSearch version for this unit.""" + return self._unit_databag.get("workload_version") + + @_unit_workload_version.setter + def _unit_workload_version(self, value: str): + self._unit_databag["workload_version"] = value + + def reconcile_partition(self, *, action_event: ops.ActionEvent = None) -> None: + """Handle Juju action to confirm first upgraded unit is healthy and resume upgrade.""" + if action_event: + self.upgrade_resumed = True + message = "Upgrade resumed." + action_event.set_results({"result": message}) + logger.debug(f"Resume upgrade event succeeded: {message}") + + @property + def upgrade_resumed(self) -> bool: + """Whether user has resumed upgrade with Juju action. 
+ + Reset to `False` after each `juju refresh` + """ + return json.loads(self._app_databag.get("upgrade-resumed", "false")) + + @upgrade_resumed.setter + def upgrade_resumed(self, value: bool): + # Trigger peer relation_changed event even if value does not change + # (Needed when leader sets value to False during `ops.UpgradeCharmEvent`) + self._app_databag["-unused-timestamp-upgrade-resume-last-updated"] = str(time.time()) + + self._app_databag["upgrade-resumed"] = json.dumps(value) + logger.debug(f"Set upgrade-resumed to {value=}") + + @property + def authorized(self) -> bool: + """Whether this unit is authorized to upgrade. + + Only applies to machine charm. + + Raises: + PrecheckFailed: App is not ready to upgrade + """ + assert self._unit_workload_container_version != self._app_workload_container_version + assert self.versions_set + for index, unit in enumerate(self._sorted_units): + if unit.name == self._unit.name: + # Higher number units have already upgraded + if index == 0: + if ( + json.loads(self._app_databag["versions"])["charm"] + == self._current_versions["charm"] + ): + # Assumes charm version uniquely identifies charm revision + logger.debug("Rollback detected. Skipping pre-upgrade check") + else: + # Run pre-upgrade check + # (in case user forgot to run pre-upgrade-check action) + self.pre_upgrade_check() + logger.debug("Pre-upgrade check after `juju refresh` successful") + elif index == 1: + # User confirmation needed to resume upgrade (i.e. upgrade second unit) + logger.debug(f"Second unit authorized to upgrade if {self.upgrade_resumed=}") + return self.upgrade_resumed + return True + state = self._peer_relation.data[unit].get("state") + if state: + state = upgrade.UnitState(state) + if ( + self._unit_workload_container_versions.get(unit.name) + != self._app_workload_container_version + or state is not upgrade.UnitState.HEALTHY + ): + # Waiting for higher number units to upgrade + return False + return False + + def upgrade_unit(self, *, charm) -> None: + """Runs the upgrade procedure. + + Only applies to machine charm. + """ + # According to the MongoDB documentation, before upgrading the primary, we must ensure a + # safe primary re-election. + try: + if self._unit.name == charm.primary and len(self._sorted_units) > 1: + logger.debug("Stepping down current primary, before upgrading service...") + charm.upgrade.step_down_primary_and_wait_reelection() + except mongodb_upgrade.FailedToElectNewPrimaryError: + # by not setting the snap revision and immediately returning, this function will be + # called again, and an empty re-elect a primary will occur again. + logger.error("Failed to reelect primary before upgrading unit.") + return + + logger.debug(f"Upgrading {self.authorized=}") + self.unit_state = upgrade.UnitState.UPGRADING + charm.install_snap_packages(packages=Config.SNAP_PACKAGES) + self._unit_databag["snap_revision"] = _SNAP_REVISION + self._unit_workload_version = self._current_versions["workload"] + logger.debug(f"Saved {_SNAP_REVISION} in unit databag after upgrade") + + # post upgrade check should be retried in case of failure, for this it is necessary to + # emit a separate event. 
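+        # The handler for that event defers itself while the cluster is still unhealthy, so the
+        # check is re-run on a later hook instead of failing the upgrade outright.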
+ charm.upgrade.post_upgrade_event.emit() + + def save_snap_revision_after_first_install(self): + """Set snap revision on first install.""" + self._unit_workload_container_version = _SNAP_REVISION + self._unit_workload_version = self._current_versions["workload"] + logger.debug( + f'Saved {_SNAP_REVISION=} and {self._current_versions["workload"]=} in unit databag after first install' + ) diff --git a/src/upgrades/mongodb_upgrade.py b/src/upgrades/mongodb_upgrade.py new file mode 100644 index 000000000..1b0f6fc78 --- /dev/null +++ b/src/upgrades/mongodb_upgrade.py @@ -0,0 +1,432 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Manager for handling MongoDB in-place upgrades.""" + +import logging +import secrets +import string +from typing import Optional, Tuple + +from charms.mongodb.v0.mongodb import MongoDBConfiguration, MongoDBConnection +from ops.charm import ActionEvent, CharmBase +from ops.framework import EventBase, EventSource, Object +from ops.model import ActiveStatus, BlockedStatus +from pymongo.errors import OperationFailure, PyMongoError, ServerSelectionTimeoutError +from tenacity import RetryError, Retrying, retry, stop_after_attempt, wait_fixed + +from config import Config +from upgrades import machine_upgrade, upgrade + +logger = logging.getLogger(__name__) + + +WRITE_KEY = "write_value" +ROLLBACK_INSTRUCTIONS = "To rollback, `juju refresh` to the previous revision" +UNHEALTHY_UPGRADE = BlockedStatus("Unhealthy after upgrade.") + + +# BEGIN: Exceptions +class FailedToElectNewPrimaryError(Exception): + """Raised when a new primary isn't elected after stepping down.""" + + +class ClusterNotHealthyError(Exception): + """Raised when the cluster is not healthy.""" + + +# END: Exceptions + + +class _PostUpgradeCheckMongoDB(EventBase): + """Run post upgrade check on MongoDB to verify that the cluster is healhty.""" + + def __init__(self, handle): + super().__init__(handle) + + +class MongoDBUpgrade(Object): + """Handlers for upgrade events.""" + + post_upgrade_event = EventSource(_PostUpgradeCheckMongoDB) + + def __init__(self, charm: CharmBase): + self.charm = charm + super().__init__(charm, upgrade.PEER_RELATION_ENDPOINT_NAME) + self.framework.observe( + charm.on[upgrade.PRECHECK_ACTION_NAME].action, self._on_pre_upgrade_check_action + ) + + self.framework.observe( + charm.on[upgrade.PEER_RELATION_ENDPOINT_NAME].relation_created, + self._on_upgrade_peer_relation_created, + ) + self.framework.observe( + charm.on[upgrade.PEER_RELATION_ENDPOINT_NAME].relation_changed, self._reconcile_upgrade + ) + self.framework.observe(charm.on.upgrade_charm, self._on_upgrade_charm) + self.framework.observe( + charm.on[upgrade.RESUME_ACTION_NAME].action, self._on_resume_upgrade_action + ) + self.framework.observe(charm.on["force-upgrade"].action, self._on_force_upgrade_action) + self.framework.observe(self.post_upgrade_event, self.post_upgrade_check) + + # BEGIN: Event handlers + def _on_upgrade_peer_relation_created(self, _) -> None: + self._upgrade.save_snap_revision_after_first_install() + if self.charm.unit.is_leader(): + if not self._upgrade.in_progress: + # Save versions on initial start + self._upgrade.set_versions_in_app_databag() + + def _reconcile_upgrade(self, _=None): + """Handle upgrade events.""" + if not self._upgrade: + logger.debug("Peer relation not available") + return + if not self._upgrade.versions_set: + logger.debug("Peer relation not ready") + return + if self.charm.unit.is_leader() and not self._upgrade.in_progress: + # Run before checking 
`self._upgrade.is_compatible` in case incompatible upgrade was
+            # forced & completed on all units.
+            self._upgrade.set_versions_in_app_databag()
+        if not self._upgrade.is_compatible:
+            self._set_upgrade_status()
+            return
+        if self._upgrade.unit_state is upgrade.UnitState.OUTDATED:
+            try:
+                authorized = self._upgrade.authorized
+            except upgrade.PrecheckFailed as exception:
+                self._set_upgrade_status()
+                self.charm.unit.status = exception.status
+                logger.debug(f"Set unit status to {self.charm.unit.status}")
+                logger.error(exception.status.message)
+                return
+            if authorized:
+                self._set_upgrade_status()
+                self._upgrade.upgrade_unit(charm=self.charm)
+            else:
+                self._set_upgrade_status()
+                logger.debug("Waiting to upgrade")
+                return
+        self._set_upgrade_status()
+
+    def _on_upgrade_charm(self, _):
+        if self.charm.unit.is_leader():
+            if not self._upgrade.in_progress:
+                logger.info("Charm upgraded. MongoDB version unchanged")
+            self._upgrade.upgrade_resumed = False
+            # Only call `_reconcile_upgrade` on leader unit to avoid race conditions with
+            # `upgrade_resumed`
+            self._reconcile_upgrade()
+
+    def _on_pre_upgrade_check_action(self, event: ActionEvent) -> None:
+        if not self.charm.unit.is_leader():
+            message = f"Must run action on leader unit. (e.g. `juju run {self.charm.app.name}/leader {upgrade.PRECHECK_ACTION_NAME}`)"
+            logger.debug(f"Pre-upgrade check event failed: {message}")
+            event.fail(message)
+            return
+        if not self._upgrade or self._upgrade.in_progress:
+            message = "Upgrade already in progress"
+            logger.debug(f"Pre-upgrade check event failed: {message}")
+            event.fail(message)
+            return
+        try:
+            self._upgrade.pre_upgrade_check()
+        except upgrade.PrecheckFailed as exception:
+            message = (
+                f"Charm is *not* ready for upgrade. Pre-upgrade check failed: {exception.message}"
+            )
+            logger.debug(f"Pre-upgrade check event failed: {message}")
+            event.fail(message)
+            return
+        message = "Charm is ready for upgrade"
+        event.set_results({"result": message})
+        logger.debug(f"Pre-upgrade check event succeeded: {message}")
+
+    def _on_resume_upgrade_action(self, event: ActionEvent) -> None:
+        if not self.charm.unit.is_leader():
+            message = f"Must run action on leader unit. (e.g. 
`juju run {self.charm.app.name}/leader {upgrade.RESUME_ACTION_NAME}`)" + logger.debug(f"Resume upgrade event failed: {message}") + event.fail(message) + return + if not self._upgrade or not self._upgrade.in_progress: + message = "No upgrade in progress" + logger.debug(f"Resume upgrade event failed: {message}") + event.fail(message) + return + self._upgrade.reconcile_partition(action_event=event) + + def _on_force_upgrade_action(self, event: ActionEvent) -> None: + if not self._upgrade or not self._upgrade.in_progress: + message = "No upgrade in progress" + logger.debug(f"Force upgrade event failed: {message}") + event.fail(message) + return + if not self._upgrade.upgrade_resumed: + message = f"Run `juju run {self.charm.app.name}/leader resume-upgrade` before trying to force upgrade" + logger.debug(f"Force upgrade event failed: {message}") + event.fail(message) + return + if self._upgrade.unit_state != "outdated": + message = "Unit already upgraded" + logger.debug(f"Force upgrade event failed: {message}") + event.fail(message) + return + logger.debug("Forcing upgrade") + event.log(f"Forcefully upgrading {self.charm.unit.name}") + self._upgrade.upgrade_unit(charm=self.charm) + event.set_results({"result": f"Forcefully upgraded {self.charm.unit.name}"}) + logger.debug("Forced upgrade") + + def post_upgrade_check(self, event: EventBase): + """Runs the post upgrade check to verify that the cluster is healthy. + + By deferring before setting unit state to HEALTHY, the user will either: + 1. have to wait for the unit to resolve itself. + 2. have to run the force-upgrade action (to upgrade the next unit). + """ + logger.debug("Running post upgrade checks to verify cluster is not broken after upgrade") + + try: + self.wait_for_cluster_healthy() + except RetryError: + logger.error( + "Cluster is not healthy after upgrading unit %s. Will retry next juju event.", + self.charm.unit.name, + ) + logger.info(ROLLBACK_INSTRUCTIONS) + self.charm.unit.status = UNHEALTHY_UPGRADE + event.defer() + return + + if not self.is_cluster_able_to_read_write(): + logger.error( + "Cluster is not healthy after upgrading unit %s, writes not propagated throughout cluster. Deferring post upgrade check.", + self.charm.unit.name, + ) + logger.info(ROLLBACK_INSTRUCTIONS) + self.charm.unit.status = UNHEALTHY_UPGRADE + event.defer() + return + + if self.charm.unit.status == UNHEALTHY_UPGRADE: + self.charm.unit.status = ActiveStatus() + + self._upgrade.unit_state = upgrade.UnitState.HEALTHY + logger.debug("Cluster is healthy after upgrading unit %s", self.charm.unit.name) + + # END: Event handlers + + # BEGIN: Helpers + def move_primary_to_last_upgrade_unit(self) -> None: + """Moves the primary to last unit that gets upgraded (the unit with the lowest id). + + Raises FailedToMovePrimaryError + """ + # no need to move primary in the scenario of one unit + if len(self._upgrade._sorted_units) < 2: + return + + with MongoDBConnection(self.charm.mongodb_config) as mongod: + unit_with_lowest_id = self._upgrade._sorted_units[-1] + if mongod.primary() == self.charm.unit_ip(unit_with_lowest_id): + logger.debug( + "Not moving Primary before upgrade, primary is already on the last unit to upgrade." + ) + return + + logger.debug("Moving primary to unit: %s", unit_with_lowest_id) + mongod.move_primary(new_primary_ip=self.charm.unit_ip(unit_with_lowest_id)) + + def _set_upgrade_status(self): + # In the future if we decide to support app statuses, we will need to handle this + # differently. 
Specifically ensuring that the upgrade status for the app has the lowest
+        # priority
+        if self.charm.unit.is_leader():
+            self.charm.app.status = self._upgrade.app_status or ActiveStatus()
+
+        # Set/clear upgrade unit status if no other unit status - upgrade status for units should
+        # have the lowest priority.
+        if isinstance(self.charm.unit.status, ActiveStatus) or (
+            isinstance(self.charm.unit.status, BlockedStatus)
+            and self.charm.unit.status.message.startswith(
+                "Rollback with `juju refresh`. Pre-upgrade check failed:"
+            )
+        ):
+            self.charm.unit.status = self._upgrade.get_unit_juju_status() or ActiveStatus()
+
+    def wait_for_cluster_healthy(self) -> None:
+        """Waits until the cluster is healthy after upgrading.
+
+        After a unit restarts it can take some time for the cluster to settle.
+
+        Raises:
+            ClusterNotHealthyError.
+        """
+        for attempt in Retrying(stop=stop_after_attempt(10), wait=wait_fixed(1)):
+            with attempt:
+                if not self.is_cluster_healthy():
+                    raise ClusterNotHealthyError()
+
+    def is_cluster_healthy(self) -> bool:
+        """Returns True if all nodes in the cluster/replica set are healthy."""
+        with MongoDBConnection(
+            self.charm.mongodb_config, "localhost", direct=True
+        ) as direct_mongo:
+            if not direct_mongo.is_ready:
+                logger.error("Cannot proceed with upgrade. Service mongod is not running")
+                return False
+
+        # unit status should be active before proceeding with the upgrade
+        unit_state = self.charm.process_statuses()
+        if not isinstance(unit_state, ActiveStatus):
+            logger.error(
+                "Cannot proceed with upgrade. Unit is not in Active state, in: %s.", unit_state
+            )
+            return False
+
+        try:
+            if self.charm.is_role(Config.Role.CONFIG_SERVER) or self.charm.is_role(
+                Config.Role.SHARD
+            ):
+                # TODO Future PR - implement node healthy check for entire cluster
+                return False
+            if self.charm.is_role(Config.Role.REPLICATION):
+                return self.are_replica_set_nodes_healthy(self.charm.mongodb_config)
+        except (PyMongoError, OperationFailure, ServerSelectionTimeoutError) as e:
+            logger.error(
+                "Cannot proceed with upgrade. 
Failed to check cluster health, error: %s", e + ) + return False + + def are_replica_set_nodes_healthy(self, mongodb_config: MongoDBConfiguration) -> bool: + """Returns true if all nodes in the MongoDB replica set are healthy.""" + with MongoDBConnection(mongodb_config) as mongod: + rs_status = mongod.get_replset_status() + rs_status = mongod.client.admin.command("replSetGetStatus") + return not mongod.is_any_sync(rs_status) + + def is_cluster_able_to_read_write(self) -> bool: + """Returns True if read and write is feasible for cluster.""" + if self.charm.is_role(Config.Role.SHARD): + logger.debug("Cannot run read/write check on shard, must run via config-server.") + return False + elif self.charm.is_role(Config.Role.CONFIG_SERVER): + # TODO Future PR - implement node healthy check for sharded cluster + pass + else: + return self.is_replica_set_able_read_write() + + def is_replica_set_able_read_write(self) -> bool: + """Returns True if is possible to write to primary and read from replicas.""" + collection_name, write_value = self.get_random_write_and_collection() + self.add_write_to_replica_set(self.charm.mongodb_config, collection_name, write_value) + write_replicated = self.is_write_on_secondaries( + self.charm.mongodb_config, collection_name, write_value + ) + self.clear_tmp_collection(self.charm.mongodb_config, collection_name) + return write_replicated + + def clear_tmp_collection( + self, mongodb_config: MongoDBConfiguration, collection_name: str + ) -> None: + """Clears the temporary collection.""" + with MongoDBConnection(mongodb_config) as mongod: + db = mongod.client["admin"] + db.drop_collection(collection_name) + + @retry( + stop=stop_after_attempt(10), + wait=wait_fixed(1), + reraise=True, + ) + def confirm_excepted_write_on_replica( + self, + host: str, + db_name: str, + collection: str, + expected_write_value: str, + secondary_config: MongoDBConfiguration, + ) -> bool: + """Returns True if the replica contains the expected write in the provided collection.""" + secondary_config.hosts = {host} + with MongoDBConnection(secondary_config, direct=True) as direct_seconary: + db = direct_seconary.client[db_name] + test_collection = db[collection] + query = test_collection.find({}, {WRITE_KEY: 1}) + if query[0][WRITE_KEY] != expected_write_value: + raise ClusterNotHealthyError + + def get_random_write_and_collection(self) -> Tuple[str, str]: + """Returns a tuple for a random collection name and a unique write to add to it.""" + choices = string.ascii_letters + string.digits + collection_name = "collection_" + "".join([secrets.choice(choices) for _ in range(32)]) + write_value = "unique_write_" + "".join([secrets.choice(choices) for _ in range(16)]) + return (collection_name, write_value) + + def add_write_to_replica_set( + self, mongodb_config: MongoDBConfiguration, collection_name, write_value + ) -> None: + """Adds a the provided write to the admin database with the provided collection.""" + with MongoDBConnection(mongodb_config) as mongod: + db = mongod.client["admin"] + test_collection = db[collection_name] + write = {WRITE_KEY: write_value} + test_collection.insert_one(write) + + def is_write_on_secondaries( + self, + mongodb_config: MongoDBConfiguration, + collection_name, + expected_write_value, + db_name: str = "admin", + ): + """Returns true if the expected write.""" + with MongoDBConnection(mongodb_config) as mongod: + primary_ip = mongod.primary() + + replica_ips = mongodb_config.hosts + secondary_ips = replica_ips - set(primary_ip) + for secondary_ip in secondary_ips: + 
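+            # Read the write back from each secondary directly; a single missing write marks the
+            # deployment as unable to read/write.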
try: + self.confirm_excepted_write_on_replica( + secondary_ip, db_name, collection_name, expected_write_value, mongodb_config + ) + except ClusterNotHealthyError: + # do not return False immediately - as it is + logger.debug("Secondary with IP %s, does not contain the expected write.") + return False + + return True + + def step_down_primary_and_wait_reelection(self) -> None: + """Steps down the current primary and waits for a new one to be elected.""" + if len(self.charm.mongodb_config.hosts) < 2: + logger.warning( + "No secondaries to become primary - upgrading primary without electing a new one, expect downtime." + ) + return + + old_primary = self.charm.primary + with MongoDBConnection(self.charm.mongodb_config) as mongod: + mongod.step_down_primary() + + for attempt in Retrying(stop=stop_after_attempt(30), wait=wait_fixed(1), reraise=True): + with attempt: + new_primary = self.charm.primary + if new_primary == old_primary: + raise FailedToElectNewPrimaryError() + + # END: helpers + + # BEGIN: properties + @property + def _upgrade(self) -> Optional[machine_upgrade.Upgrade]: + try: + return machine_upgrade.Upgrade(self.charm) + except upgrade.PeerRelationNotReady: + pass + + # END: properties diff --git a/src/upgrades/upgrade.py b/src/upgrades/upgrade.py new file mode 100644 index 000000000..de8322efd --- /dev/null +++ b/src/upgrades/upgrade.py @@ -0,0 +1,302 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. + +"""In-place upgrades. + +Based off specification: DA058 - In-Place Upgrades - Kubernetes v2 +(https://docs.google.com/document/d/1tLjknwHudjcHs42nzPVBNkHs98XxAOT2BXGGpP7NyEU/) +""" + +import abc +import copy +import enum +import json +import logging +import pathlib +import typing + +import ops +import poetry.core.constraints.version as poetry_version +from charms.mongodb.v0.mongodb import FailedToMovePrimaryError +from tenacity import RetryError + +import status_exception + +logger = logging.getLogger(__name__) + +SHARD = "shard" +PEER_RELATION_ENDPOINT_NAME = "upgrade-version-a" +PRECHECK_ACTION_NAME = "pre-upgrade-check" +RESUME_ACTION_NAME = "resume-upgrade" + + +def unit_number(unit_: ops.Unit) -> int: + """Get unit number.""" + return int(unit_.name.split("/")[-1]) + + +class PrecheckFailed(status_exception.StatusException): + """App is not ready to upgrade.""" + + def __init__(self, message: str): + self.message = message + super().__init__( + ops.BlockedStatus( + f"Rollback with `juju refresh`. 
Pre-upgrade check failed: {self.message}" + ) + ) + + +class PeerRelationNotReady(Exception): + """Upgrade peer relation not available (to this unit).""" + + +class UnitState(str, enum.Enum): + """Unit upgrade state.""" + + HEALTHY = "healthy" + RESTARTING = "restarting" # Kubernetes only + UPGRADING = "upgrading" # Machines only + OUTDATED = "outdated" # Machines only + + +class Upgrade(abc.ABC): + """In-place upgrades.""" + + def __init__(self, charm_: ops.CharmBase) -> None: + relations = charm_.model.relations[PEER_RELATION_ENDPOINT_NAME] + if not relations: + raise PeerRelationNotReady + assert len(relations) == 1 + self._peer_relation = relations[0] + self._charm = charm_ + self._unit: ops.Unit = charm_.unit + self._unit_databag = self._peer_relation.data[self._unit] + self._app_databag = self._peer_relation.data[charm_.app] + self._app_name = charm_.app.name + self._current_versions = {} # For this unit + for version, file_name in { + "charm": "charm_version", + "workload": "workload_version", + }.items(): + self._current_versions[version] = pathlib.Path(file_name).read_text().strip() + + @property + def unit_state(self) -> typing.Optional[UnitState]: + """Unit upgrade state.""" + if state := self._unit_databag.get("state"): + return UnitState(state) + + @unit_state.setter + def unit_state(self, value: UnitState) -> None: + self._unit_databag["state"] = value.value + + @property + def is_compatible(self) -> bool: + """Whether upgrade is supported from previous versions.""" + assert self.versions_set + try: + previous_version_strs: typing.Dict[str, str] = json.loads( + self._app_databag["versions"] + ) + except KeyError as exception: + logger.debug("`versions` missing from peer relation", exc_info=exception) + return False + # TODO charm versioning: remove `.split("+")` (which removes git hash before comparing) + previous_version_strs["charm"] = previous_version_strs["charm"].split("+")[0] + previous_versions: typing.Dict[str, poetry_version.Version] = { + key: poetry_version.Version.parse(value) + for key, value in previous_version_strs.items() + } + current_version_strs = copy.copy(self._current_versions) + current_version_strs["charm"] = current_version_strs["charm"].split("+")[0] + current_versions = { + key: poetry_version.Version.parse(value) for key, value in current_version_strs.items() + } + try: + # TODO Future PR: change this > sign to support downgrades + if ( + previous_versions["charm"] > current_versions["charm"] + or previous_versions["charm"].major != current_versions["charm"].major + ): + logger.debug( + f'{previous_versions["charm"]=} incompatible with {current_versions["charm"]=}' + ) + return False + if ( + previous_versions["workload"] > current_versions["workload"] + or previous_versions["workload"].major != current_versions["workload"].major + ): + logger.debug( + f'{previous_versions["workload"]=} incompatible with {current_versions["workload"]=}' + ) + return False + logger.debug( + f"Versions before upgrade compatible with versions after upgrade {previous_version_strs=} {self._current_versions=}" + ) + return True + except KeyError as exception: + logger.debug(f"Version missing from {previous_versions=}", exc_info=exception) + return False + + @property + def in_progress(self) -> bool: + """Whether upgrade is in progress.""" + logger.debug( + f"{self._app_workload_container_version=} {self._unit_workload_container_versions=}" + ) + return any( + version != self._app_workload_container_version + for version in self._unit_workload_container_versions.values() 
+ ) + + @property + def _sorted_units(self) -> typing.List[ops.Unit]: + """Units sorted from highest to lowest unit number.""" + return sorted((self._unit, *self._peer_relation.units), key=unit_number, reverse=True) + + @abc.abstractmethod + def _get_unit_healthy_status(self) -> ops.StatusBase: + """Status shown during upgrade if unit is healthy.""" + + def get_unit_juju_status(self) -> typing.Optional[ops.StatusBase]: + """Unit upgrade status.""" + if self.in_progress: + return self._get_unit_healthy_status() + + @property + def app_status(self) -> typing.Optional[ops.StatusBase]: + """App upgrade status.""" + if not self.in_progress: + return + if not self.upgrade_resumed: + # User confirmation needed to resume upgrade (i.e. upgrade second unit) + # Statuses over 120 characters are truncated in `juju status` as of juju 3.1.6 and + # 2.9.45 + if len(self._sorted_units) > 1: + resume_string = ( + "Verify highest unit is healthy & run `{RESUME_ACTION_NAME}` action. " + ) + return ops.BlockedStatus( + f"Upgrading. {resume_string}To rollback, `juju refresh` to last revision" + ) + return ops.MaintenanceStatus( + "Upgrading. To rollback, `juju refresh` to the previous revision" + ) + + @property + def versions_set(self) -> bool: + """Whether versions have been saved in app databag. + + Should only be `False` during first charm install. + + If a user upgrades from a charm that does not set versions, this charm will get stuck. + """ + return self._app_databag.get("versions") is not None + + def set_versions_in_app_databag(self) -> None: + """Save current versions in app databag. + + Used after next upgrade to check compatibility (i.e. whether that upgrade should be + allowed). + """ + assert not self.in_progress + logger.debug(f"Setting {self._current_versions=} in upgrade peer relation app databag") + self._app_databag["versions"] = json.dumps(self._current_versions) + logger.debug(f"Set {self._current_versions=} in upgrade peer relation app databag") + + @property + @abc.abstractmethod + def upgrade_resumed(self) -> bool: + """Whether user has resumed upgrade with Juju action.""" + + @property + @abc.abstractmethod + def _unit_workload_container_versions(self) -> typing.Dict[str, str]: + """{Unit name: unique identifier for unit's workload container version}. + + If and only if this version changes, the workload will restart (during upgrade or + rollback). + + On Kubernetes, the workload & charm are upgraded together + On machines, the charm is upgraded before the workload + + This identifier should be comparable to `_app_workload_container_version` to determine if + the unit & app are the same workload container version. + """ + + @property + @abc.abstractmethod + def _app_workload_container_version(self) -> str: + """Unique identifier for the app's workload container version. + + This should match the workload version in the current Juju app charm version. + + This identifier should be comparable to `_unit_workload_container_versions` to determine if + the app & unit are the same workload container version. + """ + + @abc.abstractmethod + def reconcile_partition(self, *, action_event: ops.ActionEvent = None) -> None: + """If ready, allow next unit to upgrade.""" + + @property + @abc.abstractmethod + def authorized(self) -> bool: + """Whether this unit is authorized to upgrade. + + Only applies to machine charm + """ + + @abc.abstractmethod + def upgrade_unit(self, *, charm) -> None: + """Upgrade this unit. 
+ + Only applies to machine charm + """ + + def pre_upgrade_check(self) -> None: + """Check if this app is ready to upgrade. + + Runs before any units are upgraded + + Does *not* run during rollback + + On machines, this runs before any units are upgraded (after `juju refresh`) + On machines & Kubernetes, this also runs during pre-upgrade-check action + + Can run on leader or non-leader unit + + Raises: + PrecheckFailed: App is not ready to upgrade + + TODO Kubernetes: Run (some) checks after `juju refresh` (in case user forgets to run + pre-upgrade-check action). Note: 1 unit will upgrade before we can run checks (checks may + need to be modified). + See https://chat.canonical.com/canonical/pl/cmf6uhm1rp8b7k8gkjkdsj4mya + """ + logger.debug("Running pre-upgrade checks") + + # TODO In future PR when we support upgrades on sharded clusters, have the shard verify + # that the config-server has already upgraded. + + try: + self._charm.upgrade.wait_for_cluster_healthy() + except RetryError: + logger.error("Cluster is not healthy") + raise PrecheckFailed("Cluster is not healthy") + + # On VM charms we can choose the order to upgrade, but not on K8s. In order to keep the + # two charms in sync we decided to have the VM charm have the same upgrade order as the K8s + # charm (i.e. highest to lowest.) Hence, we move the primary to the last unit to upgrade. + # This prevents the primary from jumping around from unit to unit during the upgrade + # procedure. + try: + self._charm.upgrade.move_primary_to_last_upgrade_unit() + except FailedToMovePrimaryError: + logger.error("Cluster failed to move primary before re-election.") + raise PrecheckFailed("Primary switchover failed") + + if not self._charm.upgrade.is_cluster_able_to_read_write(): + logger.error("Cluster cannot read/write to replicas") + raise PrecheckFailed("Cluster is not healthy") diff --git a/tests/integration/backup_tests/test_backups.py b/tests/integration/backup_tests/test_backups.py index 4d3311f63..034365716 100644 --- a/tests/integration/backup_tests/test_backups.py +++ b/tests/integration/backup_tests/test_backups.py @@ -12,7 +12,7 @@ from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed from ..ha_tests import helpers as ha_helpers -from ..helpers import get_app_name +from ..helpers import get_app_name, wait_for_mongodb_units_blocked from . import helpers S3_APP_NAME = "s3-integrator" @@ -53,7 +53,7 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: await ops_test.model.deploy(db_charm, num_units=3) # deploy the s3 integrator charm - await ops_test.model.deploy(S3_APP_NAME, channel="edge") + await ops_test.model.deploy(S3_APP_NAME, channel="edge", revision=17) await ops_test.model.wait_for_idle() @@ -79,14 +79,11 @@ async def test_blocked_incorrect_creds(ops_test: OpsTest) -> None: ) # verify that Charmed MongoDB is blocked and reports incorrect credentials - async with ops_test.fast_forward(): - await asyncio.gather( - ops_test.model.wait_for_idle(apps=[S3_APP_NAME], status="active"), - ops_test.model.wait_for_idle(apps=[db_app_name], status="blocked", idle_period=20), - ) - db_unit = ops_test.model.applications[db_app_name].units[0] + await ops_test.model.wait_for_idle(apps=[S3_APP_NAME], status="active") - assert db_unit.workload_status_message == "s3 credentials are incorrect." 
+ await wait_for_mongodb_units_blocked( + ops_test, db_app_name, status="s3 credentials are incorrect.", timeout=300 + ) @pytest.mark.group(1) @@ -99,14 +96,10 @@ async def test_blocked_incorrect_conf(ops_test: OpsTest, github_secrets) -> None await helpers.set_credentials(ops_test, github_secrets, cloud="AWS") # wait for both applications to be idle with the correct statuses - async with ops_test.fast_forward(): - await asyncio.gather( - ops_test.model.wait_for_idle(apps=[S3_APP_NAME], status="active"), - ops_test.model.wait_for_idle(apps=[db_app_name], status="blocked", idle_period=20), - ) - - db_unit = ops_test.model.applications[db_app_name].units[0] - assert db_unit.workload_status_message == "s3 configurations are incompatible." + ops_test.model.wait_for_idle(apps=[S3_APP_NAME], status="active") + await wait_for_mongodb_units_blocked( + ops_test, db_app_name, status="s3 configurations are incompatible.", timeout=300 + ) @pytest.mark.group(1) @@ -178,9 +171,7 @@ async def test_multi_backup(ops_test: OpsTest, continuous_writes_to_db, github_s db_unit = await helpers.get_leader_unit(ops_test) # create first backup once ready - await asyncio.gather( - ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), - ) + await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=15) action = await db_unit.run_action(action_name="create-backup") first_backup = await action.wait() @@ -198,9 +189,7 @@ async def test_multi_backup(ops_test: OpsTest, continuous_writes_to_db, github_s } await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) - await asyncio.gather( - ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), - ) + await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=15) # create a backup as soon as possible. might not be immediately possible since only one backup # can happen at a time. @@ -217,9 +206,7 @@ async def test_multi_backup(ops_test: OpsTest, continuous_writes_to_db, github_s # backup can take a lot of time so this function returns once the command was successfully # sent to pbm. 
Therefore before checking, wait for Charmed MongoDB to finish creating the # backup - await asyncio.gather( - ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), - ) + await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=15) # verify that backups was made in GCP bucket try: @@ -239,7 +226,7 @@ async def test_multi_backup(ops_test: OpsTest, continuous_writes_to_db, github_s } await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) await asyncio.gather( - ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), + ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=15), ) # verify that backups was made on the AWS bucket @@ -294,7 +281,7 @@ async def test_restore(ops_test: OpsTest, add_writes_to_db) -> None: assert restore.results["restore-status"] == "restore started", "restore not successful" await asyncio.gather( - ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), + ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=15), ) # verify all writes are present @@ -331,7 +318,7 @@ async def test_restore_new_cluster( await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters) await asyncio.gather( ops_test.model.wait_for_idle(apps=[S3_APP_NAME], status="active"), - ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), + ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=15), ) # create a backup @@ -346,7 +333,7 @@ async def test_restore_new_cluster( db_charm = await ops_test.build_charm(".") await ops_test.model.deploy(db_charm, num_units=3, application_name=NEW_CLUSTER) await asyncio.gather( - ops_test.model.wait_for_idle(apps=[NEW_CLUSTER], status="active", idle_period=20), + ops_test.model.wait_for_idle(apps=[NEW_CLUSTER], status="active", idle_period=15), ) db_unit = await helpers.get_leader_unit(ops_test, db_app_name=NEW_CLUSTER) @@ -363,7 +350,7 @@ async def test_restore_new_cluster( # wait for new cluster to sync await asyncio.gather( - ops_test.model.wait_for_idle(apps=[NEW_CLUSTER], status="active", idle_period=20), + ops_test.model.wait_for_idle(apps=[NEW_CLUSTER], status="active", idle_period=15), ) # verify that the listed backups from the old cluster are not listed as failed. 
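The backup tests above all follow the same idle-then-act pattern. A condensed sketch of that flow, assuming `get_app_name` and `get_leader_unit` behave as in the surrounding tests and treating the asserted result key as illustrative:

```python
import pytest
from pytest_operator.plugin import OpsTest

from ..helpers import get_app_name
from . import helpers


@pytest.mark.group(1)
async def test_backup_flow_sketch(ops_test: OpsTest) -> None:
    db_app_name = await get_app_name(ops_test)

    # Let the replica set settle before handing work to pbm.
    await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=15)

    leader_unit = await helpers.get_leader_unit(ops_test)
    action = await leader_unit.run_action(action_name="create-backup")
    result = await action.wait()

    # create-backup returns as soon as the request is handed to pbm, so the action
    # result only confirms the backup *started*; the tests above wait for idle again
    # before verifying the backup actually landed in the bucket.
    assert "backup started" in result.results.get("backup-status", "")  # key name assumed
```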
@@ -406,7 +393,7 @@ async def test_update_backup_password(ops_test: OpsTest) -> None: # wait for charm to be idle before setting password await asyncio.gather( - ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), + ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=15), ) parameters = {"username": "backup"} @@ -416,7 +403,7 @@ async def test_update_backup_password(ops_test: OpsTest) -> None: # wait for charm to be idle after setting password await asyncio.gather( - ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20), + ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=15), ) # verify we still have connection to pbm via creating a backup diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index dca31c170..9dd7895bb 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -5,13 +5,22 @@ import logging import subprocess from pathlib import Path -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional import ops import yaml +from dateutil.parser import parse from pymongo import MongoClient from pytest_operator.plugin import OpsTest -from tenacity import retry, retry_if_result, stop_after_attempt, wait_exponential +from tenacity import ( + Retrying, + retry, + retry_if_result, + stop_after_attempt, + stop_after_delay, + wait_exponential, + wait_fixed, +) METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) APP_NAME = METADATA["name"] @@ -24,6 +33,48 @@ logger = logging.getLogger(__name__) +class Status: + """Model class for status.""" + + def __init__(self, value: str, since: str, message: Optional[str] = None): + self.value = value + self.since = parse(since, ignoretz=True) + self.message = message + + +class Unit: + """Model class for a Unit, with properties widely used.""" + + def __init__( + self, + id: int, + name: str, + ip: str, + hostname: str, + is_leader: bool, + machine_id: int, + workload_status: Status, + agent_status: Status, + app_status: Status, + ): + self.id = id + self.name = name + self.ip = ip + self.hostname = hostname + self.is_leader = is_leader + self.machine_id = machine_id + self.workload_status = workload_status + self.agent_status = agent_status + self.app_status = app_status + + def dump(self) -> Dict[str, Any]: + """To json.""" + result = {} + for key, val in vars(self).items(): + result[key] = vars(val) if isinstance(val, Status) else val + return result + + def unit_uri(ip_address: str, password, app=APP_NAME) -> str: """Generates URI that is used by MongoDB to connect to a single replica. 
@@ -303,3 +354,86 @@ def audit_log_line_sanity_check(entry) -> bool: logger.error("Field '%s' not found in audit log entry \"%s\"", field, entry) return False return True + + +async def get_unit_hostname(ops_test: OpsTest, unit_id: int, app: str) -> str: + """Get the hostname of a specific unit.""" + _, hostname, _ = await ops_test.juju("ssh", f"{app}/{unit_id}", "hostname") + return hostname.strip() + + +def get_raw_application(ops_test: OpsTest, app: str) -> Dict[str, Any]: + """Get raw application details.""" + return json.loads( + subprocess.check_output( + f"juju status --model {ops_test.model.info.name} {app} --format=json".split() + ) + )["applications"][app] + + +async def get_application_units(ops_test: OpsTest, app: str) -> List[Unit]: + """Get fully detailed units of an application.""" + # Juju incorrectly reports the IP addresses after the network is restored this is reported as a + # bug here: https://github.com/juju/python-libjuju/issues/738. Once this bug is resolved use of + # `get_unit_ip` should be replaced with `.public_address` + raw_app = get_raw_application(ops_test, app) + units = [] + for u_name, unit in raw_app["units"].items(): + unit_id = int(u_name.split("/")[-1]) + + if not unit.get("public-address"): + # unit not ready yet... + continue + + unit = Unit( + id=unit_id, + name=u_name.replace("/", "-"), + ip=unit["public-address"], + hostname=await get_unit_hostname(ops_test, unit_id, app), + is_leader=unit.get("leader", False), + machine_id=int(unit["machine"]), + workload_status=Status( + value=unit["workload-status"]["current"], + since=unit["workload-status"]["since"], + message=unit["workload-status"].get("message"), + ), + agent_status=Status( + value=unit["juju-status"]["current"], + since=unit["juju-status"]["since"], + ), + app_status=Status( + value=raw_app["application-status"]["current"], + since=raw_app["application-status"]["since"], + message=raw_app["application-status"].get("message"), + ), + ) + + units.append(unit) + + return units + + +async def check_all_units_blocked_with_status( + ops_test: OpsTest, db_app_name: str, status: Optional[str] +) -> None: + # this is necessary because ops_model.units does not update the unit statuses + for unit in await get_application_units(ops_test, db_app_name): + assert ( + unit.workload_status.value == "blocked" + ), f"unit {unit.name} not in blocked state, in {unit.workload_status}" + if status: + assert ( + unit.workload_status.message == status + ), f"unit {unit.name} not in blocked state, in {unit.workload_status}" + + +async def wait_for_mongodb_units_blocked( + ops_test: OpsTest, db_app_name: str, status: Optional[str] = None, timeout=20 +) -> None: + """Waits for units of MongoDB to be in the blocked state. + + This is necessary because the MongoDB app can report a different status than the units. + """ + for attempt in Retrying(stop=stop_after_delay(timeout), wait=wait_fixed(1), reraise=True): + with attempt: + await check_all_units_blocked_with_status(ops_test, db_app_name, status) diff --git a/tests/integration/sharding_tests/test_sharding.py b/tests/integration/sharding_tests/test_sharding.py index 6f1003ad3..fe1414204 100644 --- a/tests/integration/sharding_tests/test_sharding.py +++ b/tests/integration/sharding_tests/test_sharding.py @@ -1,12 +1,15 @@ #!/usr/bin/env python3 # Copyright 2023 Canonical Ltd. # See LICENSE file for licensing details. 
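For reference, a small usage sketch of the `get_application_units` helper and the `Unit`/`Status` models added to `tests/integration/helpers.py` above; the import path and app name are illustrative:

```python
from pytest_operator.plugin import OpsTest

from ..helpers import get_application_units


async def log_workload_statuses(ops_test: OpsTest, app: str = "mongodb") -> None:
    # Unlike ops_test.model.units, this reads `juju status` directly, so the
    # per-unit workload statuses are always current.
    for unit in await get_application_units(ops_test, app):
        print(unit.name, unit.workload_status.value, unit.workload_status.message)
```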
-import asyncio - import pytest from pytest_operator.plugin import OpsTest -from ..helpers import get_leader_id, get_password, set_password +from ..helpers import ( + get_leader_id, + get_password, + set_password, + wait_for_mongodb_units_blocked, +) from .helpers import ( generate_mongodb_client, has_correct_shards, @@ -59,7 +62,12 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: ) await ops_test.model.wait_for_idle( - apps=[CONFIG_SERVER_APP_NAME, SHARD_ONE_APP_NAME, SHARD_THREE_APP_NAME], + apps=[ + CONFIG_SERVER_APP_NAME, + SHARD_ONE_APP_NAME, + SHARD_TWO_APP_NAME, + SHARD_THREE_APP_NAME, + ], idle_period=20, raise_on_blocked=False, timeout=TIMEOUT, @@ -67,27 +75,10 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: ) # verify that Charmed MongoDB is blocked and reports incorrect credentials - await asyncio.gather( - ops_test.model.wait_for_idle( - apps=[CONFIG_SERVER_APP_NAME], - status="blocked", - idle_period=20, - timeout=TIMEOUT, - ), - ops_test.model.wait_for_idle( - apps=[SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME, SHARD_THREE_APP_NAME], - status="blocked", - idle_period=20, - timeout=TIMEOUT, - ), - ) - - config_server_unit = ops_test.model.applications[CONFIG_SERVER_APP_NAME].units[0] - assert config_server_unit.workload_status_message == "missing relation to shard(s)" - - for shard_app_name in SHARD_APPS: - shard_unit = ops_test.model.applications[shard_app_name].units[0] - assert shard_unit.workload_status_message == "missing relation to config server" + await wait_for_mongodb_units_blocked(ops_test, CONFIG_SERVER_APP_NAME, timeout=300) + await wait_for_mongodb_units_blocked(ops_test, SHARD_ONE_APP_NAME, timeout=300) + await wait_for_mongodb_units_blocked(ops_test, SHARD_TWO_APP_NAME, timeout=300) + await wait_for_mongodb_units_blocked(ops_test, SHARD_THREE_APP_NAME, timeout=300) @pytest.mark.group(1) @@ -114,7 +105,7 @@ async def test_cluster_active(ops_test: OpsTest) -> None: SHARD_TWO_APP_NAME, SHARD_THREE_APP_NAME, ], - idle_period=20, + idle_period=15, status="active", timeout=TIMEOUT, raise_on_error=False, @@ -150,7 +141,7 @@ async def test_set_operator_password(ops_test: OpsTest): await ops_test.model.wait_for_idle( apps=CLUSTER_APPS, status="active", - idle_period=20, + idle_period=15, ), for cluster_app_name in CLUSTER_APPS: @@ -250,7 +241,7 @@ async def test_shard_removal(ops_test: OpsTest) -> None: SHARD_TWO_APP_NAME, SHARD_THREE_APP_NAME, ], - idle_period=20, + idle_period=15, status="active", timeout=TIMEOUT, raise_on_error=False, @@ -291,7 +282,7 @@ async def test_removal_of_non_primary_shard(ops_test: OpsTest): SHARD_TWO_APP_NAME, SHARD_THREE_APP_NAME, ], - idle_period=20, + idle_period=15, status="active", timeout=TIMEOUT, raise_on_error=False, @@ -304,7 +295,7 @@ async def test_removal_of_non_primary_shard(ops_test: OpsTest): await ops_test.model.wait_for_idle( apps=[CONFIG_SERVER_APP_NAME, SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME], - idle_period=20, + idle_period=15, status="active", timeout=TIMEOUT, raise_on_error=False, @@ -342,7 +333,7 @@ async def test_unconventual_shard_removal(ops_test: OpsTest): await ops_test.model.wait_for_idle( apps=[SHARD_TWO_APP_NAME], - idle_period=20, + idle_period=15, status="active", timeout=TIMEOUT, raise_on_error=False, @@ -351,7 +342,7 @@ async def test_unconventual_shard_removal(ops_test: OpsTest): await ops_test.model.applications[SHARD_TWO_APP_NAME].destroy_units(f"{SHARD_TWO_APP_NAME}/0") await ops_test.model.wait_for_idle( apps=[SHARD_TWO_APP_NAME], - idle_period=20, + idle_period=15, 
status="active", timeout=TIMEOUT, raise_on_error=False, @@ -361,7 +352,7 @@ async def test_unconventual_shard_removal(ops_test: OpsTest): await ops_test.model.wait_for_idle( apps=[CONFIG_SERVER_APP_NAME, SHARD_ONE_APP_NAME], - idle_period=20, + idle_period=15, status="active", timeout=TIMEOUT, raise_on_error=False, diff --git a/tests/integration/sharding_tests/test_sharding_relations.py b/tests/integration/sharding_tests/test_sharding_relations.py index d4f9c9661..9b3925ca3 100644 --- a/tests/integration/sharding_tests/test_sharding_relations.py +++ b/tests/integration/sharding_tests/test_sharding_relations.py @@ -5,6 +5,8 @@ from juju.errors import JujuAPIError from pytest_operator.plugin import OpsTest +from ..helpers import wait_for_mongodb_units_blocked + S3_APP_NAME = "s3-integrator" SHARD_ONE_APP_NAME = "shard" CONFIG_SERVER_ONE_APP_NAME = "config-server-one" @@ -134,20 +136,13 @@ async def test_cannot_use_db_relation(ops_test: OpsTest) -> None: for sharded_component in SHARDING_COMPONENTS: await ops_test.model.integrate(f"{APP_CHARM_NAME}:{DATABASE_REL_NAME}", sharded_component) - await ops_test.model.wait_for_idle( - apps=SHARDING_COMPONENTS, - status="blocked", - idle_period=20, - raise_on_blocked=False, - timeout=TIMEOUT, - ) - for sharded_component in SHARDING_COMPONENTS: - sharded_component_unit = ops_test.model.applications[sharded_component].units[0] - assert ( - sharded_component_unit.workload_status_message - == "Sharding roles do not support database interface." - ), f"{sharded_component} cannot be related using the database relation" + await wait_for_mongodb_units_blocked( + ops_test, + sharded_component, + status="Sharding roles do not support database interface.", + timeout=300, + ) # clean up relations for sharded_component in SHARDING_COMPONENTS: @@ -171,20 +166,13 @@ async def test_cannot_use_legacy_db_relation(ops_test: OpsTest) -> None: for sharded_component in SHARDING_COMPONENTS: await ops_test.model.integrate(LEGACY_APP_CHARM_NAME, sharded_component) - await ops_test.model.wait_for_idle( - apps=SHARDING_COMPONENTS, - status="blocked", - idle_period=20, - raise_on_blocked=False, - timeout=TIMEOUT, - ) - for sharded_component in SHARDING_COMPONENTS: - sharded_component_unit = ops_test.model.applications[sharded_component].units[0] - assert ( - sharded_component_unit.workload_status_message - == "Sharding roles do not support obsolete interface." - ), f"{sharded_component} cannot be related using the mongodb relation" + await wait_for_mongodb_units_blocked( + ops_test, + sharded_component, + status="Sharding roles do not support obsolete interface.", + timeout=300, + ) # clean up relations for sharded_component in SHARDING_COMPONENTS: @@ -211,19 +199,13 @@ async def test_replication_config_server_relation(ops_test: OpsTest): f"{CONFIG_SERVER_ONE_APP_NAME}:{CONFIG_SERVER_REL_NAME}", ) - await ops_test.model.wait_for_idle( - apps=[REPLICATION_APP_NAME], - status="blocked", - idle_period=20, - raise_on_blocked=False, - timeout=TIMEOUT, + await wait_for_mongodb_units_blocked( + ops_test, + REPLICATION_APP_NAME, + status="sharding interface cannot be used by replicas", + timeout=300, ) - replication_unit = ops_test.model.applications[REPLICATION_APP_NAME].units[0] - assert ( - replication_unit.workload_status_message == "sharding interface cannot be used by replicas" - ), "replication cannot be related to config server." 
- # clean up relations await ops_test.model.applications[REPLICATION_APP_NAME].remove_relation( f"{REPLICATION_APP_NAME}:{SHARD_REL_NAME}", @@ -241,19 +223,13 @@ async def test_replication_shard_relation(ops_test: OpsTest): f"{REPLICATION_APP_NAME}:{CONFIG_SERVER_REL_NAME}", ) - await ops_test.model.wait_for_idle( - apps=[REPLICATION_APP_NAME], - status="blocked", - idle_period=20, - raise_on_blocked=False, - timeout=TIMEOUT, + await wait_for_mongodb_units_blocked( + ops_test, + REPLICATION_APP_NAME, + status="sharding interface cannot be used by replicas", + timeout=300, ) - replication_unit = ops_test.model.applications[REPLICATION_APP_NAME].units[0] - assert ( - replication_unit.workload_status_message == "sharding interface cannot be used by replicas" - ), "replication cannot be related to config server." - # clean up relation await ops_test.model.applications[REPLICATION_APP_NAME].remove_relation( f"{SHARD_ONE_APP_NAME}:{SHARD_REL_NAME}", @@ -278,20 +254,13 @@ async def test_replication_mongos_relation(ops_test: OpsTest) -> None: f"{MONGOS_APP_NAME}", ) - await ops_test.model.wait_for_idle( - apps=[REPLICATION_APP_NAME], - idle_period=20, - status="blocked", - raise_on_blocked=False, - timeout=TIMEOUT, + await wait_for_mongodb_units_blocked( + ops_test, + REPLICATION_APP_NAME, + status="Relation to mongos not supported, config role must be config-server", + timeout=300, ) - replication_unit = ops_test.model.applications[REPLICATION_APP_NAME].units[0] - assert ( - replication_unit.workload_status_message - == "Relation to mongos not supported, config role must be config-server" - ), "replica cannot be related to mongos." - # clean up relations await ops_test.model.applications[REPLICATION_APP_NAME].remove_relation( f"{REPLICATION_APP_NAME}:cluster", @@ -316,34 +285,19 @@ async def test_shard_mongos_relation(ops_test: OpsTest) -> None: f"{MONGOS_APP_NAME}", ) - await ops_test.model.wait_for_idle( - apps=[SHARD_ONE_APP_NAME], - status="blocked", - idle_period=20, - raise_on_blocked=False, - timeout=TIMEOUT, + await wait_for_mongodb_units_blocked( + ops_test, + SHARD_ONE_APP_NAME, + status="Relation to mongos not supported, config role must be config-server", + timeout=300, ) - shard_unit = ops_test.model.applications[SHARD_ONE_APP_NAME].units[0] - assert ( - shard_unit.workload_status_message - == "Relation to mongos not supported, config role must be config-server" - ), "replica cannot be related to mongos." - # clean up relations await ops_test.model.applications[SHARD_ONE_APP_NAME].remove_relation( f"{MONGOS_APP_NAME}:cluster", f"{SHARD_ONE_APP_NAME}:cluster", ) - await ops_test.model.wait_for_idle( - apps=[SHARD_ONE_APP_NAME], - status="blocked", - idle_period=20, - raise_on_blocked=False, - timeout=TIMEOUT, - ) - @pytest.mark.group(1) @pytest.mark.abort_on_fail @@ -355,20 +309,13 @@ async def test_shard_s3_relation(ops_test: OpsTest) -> None: f"{S3_APP_NAME}", ) - await ops_test.model.wait_for_idle( - apps=[SHARD_ONE_APP_NAME], - idle_period=20, - status="blocked", - raise_on_blocked=False, - timeout=TIMEOUT, + await wait_for_mongodb_units_blocked( + ops_test, + SHARD_ONE_APP_NAME, + status="Relation to s3-integrator is not supported, config role must be config-server", + timeout=300, ) - shard_unit = ops_test.model.applications[SHARD_ONE_APP_NAME].units[0] - assert ( - shard_unit.workload_status_message - == "Relation to s3-integrator is not supported, config role must be config-server" - ), "Shard cannot be related to s3-integrator." 
- # clean up relations await ops_test.model.applications[SHARD_ONE_APP_NAME].remove_relation( f"{S3_APP_NAME}:s3-credentials", diff --git a/tests/integration/sharding_tests/test_sharding_tls.py b/tests/integration/sharding_tests/test_sharding_tls.py index 5bd243a1f..c76a52428 100644 --- a/tests/integration/sharding_tests/test_sharding_tls.py +++ b/tests/integration/sharding_tests/test_sharding_tls.py @@ -7,6 +7,7 @@ from pytest_operator.plugin import OpsTest from tenacity import Retrying, stop_after_attempt, wait_fixed +from ..helpers import wait_for_mongodb_units_blocked from ..tls_tests import helpers as tls_helpers MONGOD_SERVICE = "snap.charmed-mongodb.mongod.service" @@ -124,15 +125,9 @@ async def test_tls_inconsistent_rels(ops_test: OpsTest) -> None: raise_on_blocked=False, ) - async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle( - apps=[SHARD_ONE_APP_NAME], status="blocked", timeout=1000 - ) - - shard_unit = ops_test.model.applications[SHARD_ONE_APP_NAME].units[0] - assert ( - shard_unit.workload_status_message == "Shard requires TLS to be enabled." - ), "Shard fails to report TLS inconsistencies." + await wait_for_mongodb_units_blocked( + ops_test, SHARD_ONE_APP_NAME, status="Shard requires TLS to be enabled.", timeout=300 + ) # Re-integrate to bring cluster back to steady state await ops_test.model.integrate( @@ -160,15 +155,12 @@ async def test_tls_inconsistent_rels(ops_test: OpsTest) -> None: timeout=TIMEOUT, raise_on_blocked=False, ) - async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle( - apps=[SHARD_ONE_APP_NAME], status="blocked", timeout=1000 - ) - - shard_unit = ops_test.model.applications[SHARD_ONE_APP_NAME].units[0] - assert ( - shard_unit.workload_status_message == "Shard has TLS enabled, but config-server does not." - ), "Shard fails to report TLS inconsistencies." + await wait_for_mongodb_units_blocked( + ops_test, + SHARD_ONE_APP_NAME, + status="Shard has TLS enabled, but config-server does not.", + timeout=300, + ) # CASE 3: Cluster components are using different CA's @@ -184,15 +176,12 @@ async def test_tls_inconsistent_rels(ops_test: OpsTest) -> None: timeout=TIMEOUT, raise_on_blocked=False, ) - async with ops_test.fast_forward(): - await ops_test.model.wait_for_idle( - apps=[SHARD_ONE_APP_NAME], status="blocked", timeout=1000 - ) - - shard_unit = ops_test.model.applications[SHARD_ONE_APP_NAME].units[0] - assert ( - shard_unit.workload_status_message == "Shard CA and Config-Server CA don't match." - ), "Shard fails to report TLS inconsistencies." 
+ await wait_for_mongodb_units_blocked( + ops_test, + SHARD_ONE_APP_NAME, + status="Shard CA and Config-Server CA don't match.", + timeout=300, + ) async def check_cluster_tls_disabled(ops_test: OpsTest) -> None: diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py index 26803265e..457728a81 100644 --- a/tests/integration/test_charm.py +++ b/tests/integration/test_charm.py @@ -64,6 +64,7 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: await ops_test.model.wait_for_idle() +@pytest.mark.skip("Skip this test until upgrade are implemented in the new way") @pytest.mark.group(1) async def test_consistency_between_workload_and_metadata(ops_test: OpsTest): """Verifies that the dependencies in the charm version are accurate.""" @@ -75,6 +76,8 @@ async def test_consistency_between_workload_and_metadata(ops_test: OpsTest): # version has format x.y.z-a mongod_version = client.server_info()["version"].split("-")[0] + # Future PR - change the dependency check to check the file for workload and charm version + # instead assert ( mongod_version == Config.DEPENDENCIES["mongod_service"]["version"] ), f"Version of mongod running does not match dependency matrix, update DEPENDENCIES in src/config.py to {mongod_version}" diff --git a/tests/integration/upgrade/test_upgrade.py b/tests/integration/upgrade/test_upgrade.py index 67d2c5af8..c441487ea 100644 --- a/tests/integration/upgrade/test_upgrade.py +++ b/tests/integration/upgrade/test_upgrade.py @@ -34,14 +34,17 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None: await check_or_scale_app(ops_test, app_name, required_units=3) return - # TODO: When `6/stable` track supports upgrades deploy and test that revision instead. - await ops_test.model.deploy("mongodb", channel="edge", num_units=3) + # TODO: When upgrades are supported, deploy with most recent revision (6/stable when possible, + # but 6/edge as soon as available) + charm = await ops_test.build_charm(".") + await ops_test.model.deploy(charm, channel="edge", num_units=3) await ops_test.model.wait_for_idle( apps=["mongodb"], status="active", timeout=1000, idle_period=120 ) +@pytest.mark.skip("re-enable these tests once upgrades are functioning") @pytest.mark.group(1) async def test_upgrade(ops_test: OpsTest, continuous_writes) -> None: """Verifies that the upgrade can run successfully.""" diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 23d16553f..5c2f3f6cf 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -47,6 +47,7 @@ def setUp(self, *unused): self.addCleanup(self.harness.cleanup) self.harness.begin() self.peer_rel_id = self.harness.add_relation("database-peers", "database-peers") + self.peer_rel_id = self.harness.add_relation("upgrade-version-a", "upgrade-version-a") @pytest.fixture def use_caplog(self, caplog): @@ -198,7 +199,7 @@ def test_unit_ips(self): self.harness.add_relation_unit(rel_id, "mongodb/1") self.harness.update_relation_data(rel_id, "mongodb/1", PEER_ADDR) - resulting_ips = self.harness.charm._unit_ips + resulting_ips = self.harness.charm.unit_ips expected_ips = ["127.4.5.6", "1.1.1.1"] self.assertEqual(resulting_ips, expected_ips) @@ -625,16 +626,16 @@ def test_storage_detaching_failure_does_not_defer(self, retry_stop, connection): @patch("charm.MongodbOperatorCharm.get_secret") @patch_network_get(private_address="1.1.1.1") - @patch("charm.MongodbOperatorCharm._unit_ips") + @patch("charm.MongodbOperatorCharm.unit_ips") @patch("charm.MongoDBConnection") - def 
test_process_unremoved_units_handles_errors(self, connection, _unit_ips, get_secret): + def test_process_unremoved_units_handles_errors(self, connection, unit_ips, get_secret): """Test failures in process_unremoved_units are handled and not raised.""" get_secret.return_value = "pass123" connection.return_value.__enter__.return_value.get_replset_members.return_value = { "1.1.1.1", "2.2.2.2", } - self.harness.charm._unit_ips = ["2.2.2.2"] + self.harness.charm.unit_ips = ["2.2.2.2"] for exception in [PYMONGO_EXCEPTIONS, NotReadyError]: connection.return_value.__enter__.return_value.remove_replset_member.side_effect = ( diff --git a/tests/unit/test_upgrade.py b/tests/unit/test_upgrade.py index acebe6bec..c2af96a13 100644 --- a/tests/unit/test_upgrade.py +++ b/tests/unit/test_upgrade.py @@ -2,11 +2,9 @@ # See LICENSE file for licensing details. import unittest from unittest import mock -from unittest.mock import MagicMock, patch +from unittest.mock import patch -from charms.data_platform_libs.v0.upgrade import ClusterNotReadyError -from charms.operator_libs_linux.v2 import snap -from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus +from ops.model import ActiveStatus, BlockedStatus from ops.testing import Harness from charm import MongodbOperatorCharm @@ -20,11 +18,12 @@ def setUp(self, *unused): self.addCleanup(self.harness.cleanup) self.harness.begin() self.peer_rel_id = self.harness.add_relation("database-peers", "database-peers") - self.peer_rel_id = self.harness.add_relation("upgrade", "upgrade") + self.peer_rel_id = self.harness.add_relation("upgrade-version-a", "upgrade-version-a") @patch_network_get(private_address="1.1.1.1") - @patch("events.upgrade.MongoDBConnection") - def test_is_cluster_healthy(self, connection): + @patch("charms.mongodb.v1.helpers.MongoDBConnection") + @patch("upgrades.mongodb_upgrade.MongoDBConnection") + def test_is_cluster_healthy(self, connection, connection_ready): """Test is_cluster_healthy function.""" def is_shard_mock_call(*args): @@ -43,90 +42,35 @@ def is_replication_mock_call(*args): self.harness.charm.is_role = is_shard_mock_call assert not self.harness.charm.upgrade.is_cluster_healthy() - # case 2: cluster is still syncing + # case 2: unit is not ready after restarting + connection_ready.return_value.__enter__.return_value.is_ready = False + assert not self.harness.charm.upgrade.is_cluster_healthy() + + # case 3: cluster is still syncing + connection_ready.return_value.__enter__.return_value.is_ready = True self.harness.charm.is_role = is_replication_mock_call self.harness.charm.process_statuses = active_status connection.return_value.__enter__.return_value.is_any_sync.return_value = True assert not self.harness.charm.upgrade.is_cluster_healthy() - # case 3: unit is not active + # case 4: unit is not active self.harness.charm.process_statuses = blocked_status connection.return_value.__enter__.return_value.is_any_sync.return_value = False assert not self.harness.charm.upgrade.is_cluster_healthy() - # # case 4: cluster is helathy + # case 5: cluster is helathy self.harness.charm.process_statuses = active_status assert self.harness.charm.upgrade.is_cluster_healthy() @patch_network_get(private_address="1.1.1.1") - @patch("events.upgrade.MongoDBConnection") - @patch("charm.MongoDBUpgrade.is_excepted_write_on_replica") - def test_is_replica_set_able_read_write(self, is_excepted_write_on_replica, connection): + @patch("upgrades.mongodb_upgrade.MongoDBConnection") + @patch("charm.MongoDBUpgrade.is_write_on_secondaries") + def 
test_is_replica_set_able_read_write(self, is_write_on_secondaries, connection): """Test test_is_replica_set_able_read_write function.""" # case 1: writes are not present on secondaries - is_excepted_write_on_replica.return_value = False + is_write_on_secondaries.return_value = False assert not self.harness.charm.upgrade.is_replica_set_able_read_write() # case 2: writes are present on secondaries - is_excepted_write_on_replica.return_value = True + is_write_on_secondaries.return_value = True assert self.harness.charm.upgrade.is_replica_set_able_read_write() - - @patch_network_get(private_address="1.1.1.1") - @patch("charm.MongoDBConnection") - def test_build_upgrade_stack(self, connection): - """Tests that build upgrade stack puts the primary unit at the bottom of the stack.""" - rel_id = self.harness.charm.model.get_relation("database-peers").id - self.harness.add_relation_unit(rel_id, "mongodb/1") - connection.return_value.__enter__.return_value.primary.return_value = "1.1.1.1" - assert self.harness.charm.upgrade.build_upgrade_stack() == [0, 1] - - @patch_network_get(private_address="1.1.1.1") - @patch("events.upgrade.Retrying") - @patch("charm.MongoDBUpgrade.is_excepted_write_on_replica") - @patch("charm.MongodbOperatorCharm.restart_charm_services") - @patch("charm.MongoDBConnection") - @patch("events.upgrade.MongoDBConnection") - @patch("charm.MongodbOperatorCharm.install_snap_packages") - @patch("charm.MongodbOperatorCharm.stop_charm_services") - @patch("charm.MongoDBUpgrade.post_upgrade_check") - def test_on_upgrade_granted( - self, - post_upgrade_check, - stop_charm_services, - install_snap_packages, - connection_1, - connection_2, - restart, - is_excepted_write_on_replica, - retrying, - ): - # upgrades need a peer relation to proceed - rel_id = self.harness.charm.model.get_relation("database-peers").id - self.harness.add_relation_unit(rel_id, "mongodb/1") - - # case 1: fails to install snap_packages - install_snap_packages.side_effect = snap.SnapError - mock_event = MagicMock() - self.harness.charm.upgrade._on_upgrade_granted(mock_event) - restart.assert_not_called() - - # case 2: post_upgrade_check fails - install_snap_packages.side_effect = None - # disable_retry - post_upgrade_check.side_effect = ClusterNotReadyError( - "post-upgrade check failed and cannot safely upgrade", - cause="Cluster cannot read/write", - ) - mock_event = MagicMock() - self.harness.charm.upgrade._on_upgrade_granted(mock_event) - restart.assert_called() - self.assertTrue(isinstance(self.harness.charm.unit.status, BlockedStatus)) - - # case 3: everything works - install_snap_packages.side_effect = None - is_excepted_write_on_replica.return_value = True - post_upgrade_check.side_effect = None - mock_event = MagicMock() - self.harness.charm.upgrade._on_upgrade_granted(mock_event) - restart.assert_called() - self.assertTrue(isinstance(self.harness.charm.unit.status, MaintenanceStatus)) diff --git a/tox.ini b/tox.ini index 0d0355ece..29b406a09 100644 --- a/tox.ini +++ b/tox.ini @@ -22,6 +22,32 @@ pass_env = CHARM_BUILD_DIR MODEL_SETTINGS +[testenv:build-{production,dev,wrapper}] +# Wrap `charmcraft pack`, TODO support poetry on this charm, this is a non-supported usage of the +# build wrapper. 
+pass_env = + CI +allowlist_externals = + {[testenv]allowlist_externals} + charmcraft + charmcraftcache + mv +commands_pre = + # TODO charm versioning: Remove + # Workaround to add unique identifier (git hash) to charm version while specification + # DA053 - Charm versioning + # (https://docs.google.com/document/d/1Jv1jhWLl8ejK3iJn7Q3VbCIM9GIhp8926bgXpdtx-Sg/edit?pli=1) + # is pending review. + python -c 'import pathlib; import shutil; import subprocess; git_hash=subprocess.run(["git", "describe", "--always", "--dirty"], capture_output=True, check=True, encoding="utf-8").stdout; file = pathlib.Path("charm_version"); shutil.copy(file, pathlib.Path("charm_version.backup")); version = file.read_text().strip(); file.write_text(f"{version}+{git_hash}")' + # TODO comment this back in when charm supports poetry. poetry export --only main,charm-libs --output requirements.txt + +commands = + build-production: charmcraft pack {posargs} + build-dev: charmcraftcache pack {posargs} +commands_post = + # TODO comment this back in when charm supports poetry. mv requirements.txt requirements-last-build.txt + mv charm_version.backup charm_version + [testenv:format] description = Apply coding style standards to code deps = @@ -82,7 +108,7 @@ deps = pytest juju==3.4.0.0 pytest-mock - pytest-operator + pytest-operator ==0.34.0 git+https://github.com/canonical/data-platform-workflows@v13.1.1\#subdirectory=python/pytest_plugins/github_secrets git+https://github.com/canonical/data-platform-workflows@v13.1.1\#subdirectory=python/pytest_plugins/pytest_operator_groups git+https://github.com/canonical/data-platform-workflows@v13.1.1\#subdirectory=python/pytest_plugins/pytest_operator_cache diff --git a/workload_version b/workload_version new file mode 100644 index 000000000..b7ff1516c --- /dev/null +++ b/workload_version @@ -0,0 +1 @@ +6.0.6
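For readability, the `commands_pre` one-liner that appends the git hash to `charm_version` expands to roughly the following; behaviour is unchanged, and `commands_post` restores the backup copy afterwards:

```python
import pathlib
import shutil
import subprocess

# `git describe --always --dirty` output keeps its trailing newline, exactly as in
# the tox one-liner above.
git_hash = subprocess.run(
    ["git", "describe", "--always", "--dirty"],
    capture_output=True,
    check=True,
    encoding="utf-8",
).stdout
file = pathlib.Path("charm_version")
shutil.copy(file, pathlib.Path("charm_version.backup"))
version = file.read_text().strip()
file.write_text(f"{version}+{git_hash}")
```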