Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1293,15 +1293,20 @@ def _on_stop(self, _):
f"failed to patch k8s {type(resource).__name__} {resource.metadata.name}"
)

def _on_update_status(self, _) -> None:
"""Update the unit status message."""
def _on_update_status_early_exit_checks(self, container) -> bool:
if not self.upgrade.idle:
logger.debug("Early exit on_update_status: upgrade in progress")
return
return False

container = self.unit.get_container("postgresql")
if not container.can_connect():
logger.debug("on_update_status early exit: Cannot connect to container")
return False
return True

def _on_update_status(self, _) -> None:
"""Update the unit status message."""
container = self.unit.get_container("postgresql")
if not self._on_update_status_early_exit_checks(container):
return

if self._has_blocked_status or self._has_non_restore_waiting_status:
Expand All @@ -1325,7 +1330,10 @@ def _on_update_status(self, _) -> None:
logger.warning(
"%s pebble service inactive, restarting service" % self._postgresql_service
)
container.restart(self._postgresql_service)
try:
container.restart(self._postgresql_service)
except ChangeError:
logger.exception("Failed to restart patroni")
Comment on lines +1333 to +1336
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If there's a dangling Postgresql process, Patroni will be unable to start

# If service doesn't recover fast, exit and wait for next hook run to re-check
if not self._patroni.member_started:
self.unit.status = MaintenanceStatus("Database service inactive, restarting")
Expand Down
21 changes: 17 additions & 4 deletions tests/integration/ha_tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,22 +100,30 @@ def get_patroni_cluster(unit_ip: str) -> Dict[str, str]:
return resp.json()


async def change_patroni_setting(ops_test: OpsTest, setting: str, value: int) -> None:
async def change_patroni_setting(
ops_test: OpsTest, setting: str, value: Union[int, str], tls: bool = False
) -> None:
"""Change the value of one of the Patroni settings.

Args:
ops_test: ops_test instance.
setting: the name of the setting.
value: the value to assign to the setting.
tls: if Patroni is serving using tls.
"""
if tls:
schema = "https"
else:
schema = "http"
for attempt in Retrying(stop=stop_after_delay(30 * 2), wait=wait_fixed(3)):
with attempt:
app = await app_name(ops_test)
primary_name = await get_primary(ops_test, app)
unit_ip = await get_unit_address(ops_test, primary_name)
requests.patch(
f"http://{unit_ip}:8008/config",
f"{schema}://{unit_ip}:8008/config",
json={setting: value},
verify=not tls,
)


Expand Down Expand Up @@ -404,22 +412,27 @@ def get_host_ip(host: str) -> str:
return member_ips


async def get_patroni_setting(ops_test: OpsTest, setting: str) -> Optional[int]:
async def get_patroni_setting(ops_test: OpsTest, setting: str, tls: bool = False) -> Optional[int]:
"""Get the value of one of the integer Patroni settings.

Args:
ops_test: ops_test instance.
setting: the name of the setting.
tls: if Patroni is serving using tls.

Returns:
the value of the configuration or None if it's using the default value.
"""
if tls:
schema = "https"
else:
schema = "http"
for attempt in Retrying(stop=stop_after_delay(30 * 2), wait=wait_fixed(3)):
with attempt:
app = await app_name(ops_test)
primary_name = await get_primary(ops_test, app)
unit_ip = await get_unit_address(ops_test, primary_name)
configuration_info = requests.get(f"http://{unit_ip}:8008/config")
configuration_info = requests.get(f"{schema}://{unit_ip}:8008/config", verify=not tls)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please share the reason for verify=not tls?

primary_start_timeout = configuration_info.json().get(setting)
return int(primary_start_timeout) if primary_start_timeout is not None else None

Expand Down
46 changes: 34 additions & 12 deletions tests/integration/test_tls.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
from tenacity import Retrying, stop_after_delay, wait_fixed

from . import architecture, markers
from .ha_tests.helpers import (
change_patroni_setting,
)
from .helpers import (
DATABASE_APP_NAME,
build_and_deploy,
Expand Down Expand Up @@ -55,7 +58,6 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None:
await build_and_deploy(ops_test, DATABASE_UNITS, wait_for_idle=False)


@pytest.mark.group(1)
async def check_tls_rewind(ops_test: OpsTest) -> None:
"""Checks if TLS was used by rewind."""
for unit in ops_test.model.applications[DATABASE_APP_NAME].units:
Expand All @@ -76,23 +78,16 @@ async def check_tls_rewind(ops_test: OpsTest) -> None:


@pytest.mark.group(1)
@markers.amd64_only # mattermost-k8s charm not available for arm64
async def test_mattermost_db(ops_test: OpsTest) -> None:
"""Deploy Mattermost to test the 'db' relation.

Mattermost needs TLS enabled on PostgreSQL to correctly connect to it.

Args:
ops_test: The ops test framework
"""
@pytest.mark.abort_on_fail
async def test_tls(ops_test: OpsTest) -> None:
async with ops_test.fast_forward():
# Deploy TLS Certificates operator.
await ops_test.model.deploy(
tls_certificates_app_name, config=tls_config, channel=tls_channel
)
# Relate it to the PostgreSQL to enable TLS.
await ops_test.model.relate(DATABASE_APP_NAME, tls_certificates_app_name)
await ops_test.model.wait_for_idle(status="active", timeout=1000)
await ops_test.model.wait_for_idle(status="active", timeout=1000, raise_on_error=False)

# Wait for all units enabling TLS.
for unit in ops_test.model.applications[DATABASE_APP_NAME].units:
Expand Down Expand Up @@ -122,7 +117,10 @@ async def test_mattermost_db(ops_test: OpsTest) -> None:
await ops_test.model.wait_for_idle(
apps=[DATABASE_APP_NAME], status="active", idle_period=30
)
# Pause Patroni so it doesn't wipe the custom changes
await change_patroni_setting(ops_test, "pause", True, tls=True)

async with ops_test.fast_forward("24h"):
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(2), reraise=True):
with attempt:
# Promote the replica to primary.
Expand Down Expand Up @@ -154,7 +152,13 @@ async def test_mattermost_db(ops_test: OpsTest) -> None:

# Stop the initial primary.
logger.info(f"stopping database on {primary}")
await run_command_on_unit(ops_test, primary, "/charm/bin/pebble stop postgresql")
try:
await run_command_on_unit(ops_test, primary, "/charm/bin/pebble stop postgresql")
except Exception as e:
# pebble stop on juju 2 errors out and leaves dangling PG processes
if juju_major_version > 2:
raise e
await run_command_on_unit(ops_test, primary, "pkill --signal SIGTERM -x postgres")
Comment on lines +155 to +161
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On Juju 2 if the pebble stop command fails, the Patroni process is terminated, but a dangling Postgresql process remains and prevents Patroni from starting again.

Alternatively, we can do this check and termination in the charm code, but I would prefer not to add Juju 2 specific live code. Thoughts?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On 2.9 if patroni stop failed: kill -SIGTERM $postgresql_pid ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On 2.9 if patroni stop failed: kill -SIGTERM $postgresql_pid ?

Pretty much. We can do this in the ChangeError in the restart check. Question is do we want to add that code for Juju 2's sake?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I prefer that we keep this SIGTERM call in the test code and not in the charm code. Even if it is a SIGTERM, I'd prefer either to wait for the database process to gracefully finish by itself or analyze the situation manually before doing any action to avoid any data corruption.


# Check that the primary changed.
assert await primary_changed(ops_test, primary), "primary not changed"
Expand All @@ -163,12 +167,26 @@ async def test_mattermost_db(ops_test: OpsTest) -> None:
for attempt in Retrying(stop=stop_after_delay(60 * 3), wait=wait_fixed(2), reraise=True):
with attempt:
await check_tls_rewind(ops_test)
await change_patroni_setting(ops_test, "pause", False, tls=True)

async with ops_test.fast_forward():
# Await for postgresql to be stable if not already
await ops_test.model.wait_for_idle(
apps=[DATABASE_APP_NAME], status="active", idle_period=15
)


@pytest.mark.group(1)
@markers.amd64_only # mattermost-k8s charm not available for arm64
async def test_mattermost_db(ops_test: OpsTest) -> None:
"""Deploy Mattermost to test the 'db' relation.

Mattermost needs TLS enabled on PostgreSQL to correctly connect to it.

Args:
ops_test: The ops test framework
"""
async with ops_test.fast_forward():
# Deploy and check Mattermost user and database existence.
relation_id = await deploy_and_relate_application_with_postgresql(
ops_test, "mattermost-k8s", MATTERMOST_APP_NAME, APPLICATION_UNITS, status="waiting"
Expand All @@ -179,6 +197,10 @@ async def test_mattermost_db(ops_test: OpsTest) -> None:

await check_database_users_existence(ops_test, mattermost_users, [])


@pytest.mark.group(1)
async def test_remove_tls(ops_test: OpsTest) -> None:
async with ops_test.fast_forward():
Comment on lines +200 to +203
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Splitting off TLS removal as well.

# Remove the relation.
await ops_test.model.applications[DATABASE_APP_NAME].remove_relation(
f"{DATABASE_APP_NAME}:certificates", f"{tls_certificates_app_name}:certificates"
Expand Down
Loading