Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 0 additions & 20 deletions actions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,6 @@ create-replication:
default: default
get-primary:
description: Get the unit which is the primary/leader in the replication.
get-password:
description: Get a charm system user's password.
Useful for manual troubleshooting and for backing up cluster credentials.
It cannot be used for application integration relations.
params:
username:
type: string
description: The username, the default value 'operator'.
Possible values - backup, operator, replication, rewind, patroni.
list-backups:
description: Lists backups in s3 storage in AWS.
pre-upgrade-check:
Expand Down Expand Up @@ -56,17 +47,6 @@ restore:
description: Point-in-time-recovery target in PSQL format.
resume-upgrade:
description: Resume a rolling upgrade after asserting successful upgrade of a new revision.
set-password:
description: Change the system user's password, which is used by charm.
It is for internal charm users and SHOULD NOT be used by applications.
params:
username:
type: string
description: The username, the default value 'operator'.
Possible values - backup, operator, replication, rewind.
password:
type: string
description: The password will be auto-generated if this option is not specified.
set-tls-private-key:
description: Set the private key, which will be used for certificate signing requests (CSR). Run for each unit separately.
params:
Expand Down
8 changes: 8 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,14 @@ options:
Allowed values are: from -1 to 86400.
type: int
default: -1
system-users:
type: secret
description: |
Configure the internal system users and their passwords. The passwords will
be auto-generated if this option is not set. It is for internal use only
and SHOULD NOT be used by applications. This needs to be a Juju Secret URI pointing
to a secret that contains the following content: `<username>: <password>`.
Possible users: backup, monitoring, operator, replication, rewind.
vacuum_autovacuum_analyze_scale_factor:
description: |
Specifies a fraction of the table size to add to autovacuum_vacuum_threshold when
Expand Down
178 changes: 102 additions & 76 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
HookEvent,
LeaderElectedEvent,
RelationDepartedEvent,
SecretChangedEvent,
WorkloadEvent,
)
from ops.model import (
Expand All @@ -68,6 +69,7 @@
MaintenanceStatus,
ModelError,
Relation,
SecretNotFoundError,
Unit,
UnknownStatus,
WaitingStatus,
Expand Down Expand Up @@ -211,12 +213,12 @@ def __init__(self, *args):
self.framework.observe(self.on.leader_elected, self._on_leader_elected)
self.framework.observe(self.on[PEER].relation_changed, self._on_peer_relation_changed)
self.framework.observe(self.on.secret_changed, self._on_peer_relation_changed)
# add specific handler for updated system-user secrets
self.framework.observe(self.on.secret_changed, self._on_secret_changed)
self.framework.observe(self.on[PEER].relation_departed, self._on_peer_relation_departed)
self.framework.observe(self.on.postgresql_pebble_ready, self._on_postgresql_pebble_ready)
self.framework.observe(self.on.pgdata_storage_detaching, self._on_pgdata_storage_detaching)
self.framework.observe(self.on.stop, self._on_stop)
self.framework.observe(self.on.get_password_action, self._on_get_password)
self.framework.observe(self.on.set_password_action, self._on_set_password)
self.framework.observe(self.on.promote_to_primary_action, self._on_promote_to_primary)
self.framework.observe(self.on.get_primary_action, self._on_get_primary)
self.framework.observe(self.on.update_status, self._on_update_status)
Expand Down Expand Up @@ -380,6 +382,25 @@ def remove_secret(self, scope: Scopes, key: str) -> None:

self.peer_relation_data(scope).delete_relation_data(peers.id, [secret_key])

def get_secret_from_id(self, secret_id: str) -> dict[str, str]:
    """Resolve the given id of a Juju secret and return the content as a dict.

    This method can be used to retrieve any secret, not just those used via the peer relation.
    If the secret is not owned by the charm, it has to be granted access to it.

    Args:
        secret_id (str): The id of the secret.

    Returns:
        dict: The content of the secret, refreshed to its latest revision.

    Raises:
        SecretNotFoundError: Propagated unchanged from the secret lookup.
        ModelError: Propagated unchanged from the secret lookup.
    """
    # Errors are intentionally not caught here: a handler that only re-raises
    # adds nothing, and the callers decide how to react (block, defer, log).
    return self.model.get_secret(id=secret_id).get_content(refresh=True)

@property
def is_cluster_initialised(self) -> bool:
"""Returns whether the cluster is already initialised."""
Expand Down Expand Up @@ -662,6 +683,17 @@ def _on_peer_relation_changed(self, event: HookEvent) -> None: # noqa: C901

self.async_replication.handle_read_only_mode()

def _on_secret_changed(self, event: SecretChangedEvent) -> None:
    """React to a changed secret by syncing system-user passwords.

    Only the leader unit acts, and only when the changed secret is the one
    referenced by the ``system_users`` config option. If the password update
    raises ``PostgreSQLUpdateUserPasswordError`` the event is deferred so it
    is retried later.
    """
    if not self.unit.is_leader():
        return

    configured_secret_id = self.config.system_users
    # Ignore the event when no secret is configured or another secret changed.
    if not configured_secret_id or configured_secret_id != event.secret.id:
        return

    try:
        self._update_admin_password(configured_secret_id)
    except PostgreSQLUpdateUserPasswordError:
        event.defer()

def _on_config_changed(self, event) -> None:
"""Handle configuration changes, like enabling plugins."""
if not self.is_cluster_initialised:
Expand Down Expand Up @@ -703,6 +735,12 @@ def _on_config_changed(self, event) -> None:
# Enable and/or disable the extensions.
self.enable_disable_extensions()

if admin_secret_id := self.config.system_users:
try:
self._update_admin_password(admin_secret_id)
except PostgreSQLUpdateUserPasswordError:
event.defer()

def enable_disable_extensions(self, database: str | None = None) -> None:
"""Enable/disable PostgreSQL extensions set through config options.

Expand Down Expand Up @@ -858,6 +896,17 @@ def _get_hostname_from_unit(self, member: str) -> str:

def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
"""Handle the leader-elected event."""
# consider configured system user passwords
system_user_passwords = {}
if admin_secret_id := self.config.system_users:
try:
system_user_passwords = self.get_secret_from_id(secret_id=admin_secret_id)
except (ModelError, SecretNotFoundError) as e:
# only display the error but don't return to make sure all users have passwords
logger.error(f"Error setting internal passwords: {e}")
self.unit.status = BlockedStatus("Password setting for system users failed.")
event.defer()

for password in {
USER_PASSWORD_KEY,
REPLICATION_PASSWORD_KEY,
Expand All @@ -866,7 +915,14 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
PATRONI_PASSWORD_KEY,
}:
if self.get_secret(APP_SCOPE, password) is None:
self.set_secret(APP_SCOPE, password, new_password())
if password in system_user_passwords:
# use provided passwords for system-users if available
self.set_secret(APP_SCOPE, password, system_user_passwords[password])
logger.info(f"Using configured password for {password}")
else:
# generate a password for this user if not provided
self.set_secret(APP_SCOPE, password, new_password())
logger.info(f"Generated new password for {password}")

# Add this unit to the list of cluster members
# (the cluster should start with only this member).
Expand Down Expand Up @@ -1202,66 +1258,22 @@ def _has_non_restore_waiting_status(self) -> bool:
and not self.is_cluster_restoring_to_time
)

def _on_get_password(self, event: ActionEvent) -> None:
    """Answer the get-password action with the stored password of a user.

    The ``username`` action parameter falls back to ``USER`` when it is not
    given. The action fails for any username outside ``PASSWORD_USERS``; the
    failure message differs depending on whether LDAP is enabled.
    """
    requested_user = event.params.get("username", USER)

    if requested_user not in PASSWORD_USERS:
        # LDAP state is only consulted once the username is known to be invalid.
        if self.is_ldap_enabled:
            failure = "The action can be run only for system users when LDAP is enabled"
        else:
            allowed_users = ", ".join(PASSWORD_USERS)
            failure = (
                "The action can be run only for system users or Patroni:"
                f" {allowed_users} not {requested_user}"
            )
        event.fail(failure)
        return

    secret_key = f"{requested_user}-password"
    event.set_results({"password": self.get_secret(APP_SCOPE, secret_key)})

def _on_set_password(self, event: ActionEvent) -> None: # noqa: C901
"""Set the password for the specified user."""
# Only leader can write the new password into peer relation.
if not self.unit.is_leader():
event.fail("The action can be run only on leader unit")
return

username = event.params.get("username", USER)
if username not in SYSTEM_USERS and self.is_ldap_enabled:
event.fail("The action can be run only for system users when LDAP is enabled")
return
if username not in SYSTEM_USERS:
event.fail(
f"The action can be run only for system users:"
f" {', '.join(SYSTEM_USERS)} not {username}"
)
return

password = new_password()
if "password" in event.params:
password = event.params["password"]

if password == self.get_secret(APP_SCOPE, f"{username}-password"):
event.log("The old and new passwords are equal.")
event.set_results({"password": password})
return

# Ensure all members are ready before trying to reload Patroni
# configuration to avoid errors (like the API not responding in
# one instance because PostgreSQL and/or Patroni are not ready).
def _update_admin_password(self, admin_secret_id: str) -> None:
"""Check if the password of a system user was changed and update it in the database."""
if not self._patroni.are_all_members_ready():
event.fail(
# Ensure all members are ready before reloading Patroni configuration to avoid errors
# e.g. API not responding in one instance because PostgreSQL / Patroni are not ready
raise PostgreSQLUpdateUserPasswordError(
"Failed changing the password: Not all members healthy or finished initial sync."
)
return

# cross-cluster replication: extract the database host on which to update the passwords
replication_offer_relation = self.model.get_relation(REPLICATION_OFFER_RELATION)
other_cluster_primary_ip = ""
if (
replication_offer_relation is not None
and not self.async_replication.is_primary_cluster()
):
# Update the password in the other cluster PostgreSQL primary instance.
other_cluster_endpoints = self.async_replication.get_all_primary_cluster_endpoints()
other_cluster_primary = self._patroni.get_primary(
alternative_endpoints=other_cluster_endpoints
Expand All @@ -1271,37 +1283,51 @@ def _on_set_password(self, event: ActionEvent) -> None: # noqa: C901
for unit in replication_offer_relation.units
if unit.name.replace("/", "-") == other_cluster_primary
)
try:
self.postgresql.update_user_password(
username, password, database_host=other_cluster_primary_ip
)
except PostgreSQLUpdateUserPasswordError as e:
logger.exception(e)
event.fail("Failed changing the password.")
return
elif self.model.get_relation(REPLICATION_CONSUMER_RELATION) is not None:
event.fail(
"Failed changing the password: This action can be ran only in the cluster from the offer side."
logger.error(
"Failed changing the password: This can be ran only in the cluster from the offer side."
)
self.unit.status = BlockedStatus("Password update for system users failed.")
return
else:
# Update the password in this cluster PostgreSQL primary instance.
try:
self.postgresql.update_user_password(username, password)
except PostgreSQLUpdateUserPasswordError as e:
logger.exception(e)
event.fail("Failed changing the password.")
return

# Update the password in the secret store.
self.set_secret(APP_SCOPE, f"{username}-password", password)
try:
# get the secret content and check each user configured there
# only SYSTEM_USERS with changed passwords are processed, all others ignored
updated_passwords = self.get_secret_from_id(secret_id=admin_secret_id)
for user, password in list(updated_passwords.items()):
if user not in SYSTEM_USERS:
logger.error(
f"Can only update system users: {', '.join(SYSTEM_USERS)} not {user}"
)
updated_passwords.pop(user)
continue
if password == self.get_secret(APP_SCOPE, f"{user}-password"):
updated_passwords.pop(user)
except (ModelError, SecretNotFoundError) as e:
logger.error(f"Error updating internal passwords: {e}")
self.unit.status = BlockedStatus("Password update for system users failed.")
return

try:
# perform the actual password update for the remaining users
for user, password in updated_passwords.items():
logger.info(f"Updating password for user {user}")
self.postgresql.update_user_password(
user,
password,
database_host=other_cluster_primary_ip if other_cluster_primary_ip else None,
)
# Update the password in the secret store after updating it in the database
self.set_secret(APP_SCOPE, f"{user}-password", password)
except PostgreSQLUpdateUserPasswordError as e:
logger.exception(e)
self.unit.status = BlockedStatus("Password update for system users failed.")
return

# Update and reload Patroni configuration in this unit to use the new password.
# Other units Patroni configuration will be reloaded in the peer relation changed event.
self.update_config()

event.set_results({"password": password})

def _on_promote_to_primary(self, event: ActionEvent) -> None:
if event.params.get("scope") == "cluster":
return self.async_replication.promote_to_primary(event)
Expand Down
1 change: 1 addition & 0 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ class CharmConfig(BaseConfigModel):
storage_default_table_access_method: str | None
storage_gin_pending_list_limit: int | None
storage_old_snapshot_threshold: int | None
system_users: str | None
vacuum_autovacuum_analyze_scale_factor: float | None
vacuum_autovacuum_analyze_threshold: int | None
vacuum_autovacuum_freeze_max_age: int | None
Expand Down
1 change: 1 addition & 0 deletions src/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
SECRET_CACHE_LABEL = "cache" # noqa: S105
SECRET_INTERNAL_LABEL = "internal-secret" # noqa: S105
SECRET_DELETED_LABEL = "None" # noqa: S105
SYSTEM_USERS_PASSWORD_CONFIG = "system-users" # noqa: S105

APP_SCOPE = "app"
UNIT_SCOPE = "unit"
Expand Down
7 changes: 3 additions & 4 deletions tests/integration/ha_tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
db_connect,
execute_query_on_unit,
get_password,
get_password_on_unit,
get_primary,
get_unit_address,
run_command_on_unit,
Expand Down Expand Up @@ -317,7 +316,7 @@ async def count_writes(
) -> tuple[dict[str, int], dict[str, int]]:
"""Count the number of writes in the database."""
app = await app_name(ops_test)
password = await get_password(ops_test, database_app_name=app, down_unit=down_unit)
password = await get_password(ops_test, database_app_name=app)
members = []
for model in [ops_test.model, extra_model]:
if model is None:
Expand Down Expand Up @@ -1015,7 +1014,7 @@ async def create_db(ops_test: OpsTest, app: str, db: str) -> None:
"""Creates database with specified name."""
unit = ops_test.model.applications[app].units[0]
unit_address = await get_unit_address(ops_test, unit.name)
password = await get_password_on_unit(ops_test, "operator", unit, app)
password = await get_password(ops_test, "operator", app)

conn = db_connect(unit_address, password)
conn.autocommit = True
Expand All @@ -1030,7 +1029,7 @@ async def check_db(ops_test: OpsTest, app: str, db: str) -> bool:
"""Returns True if database with specified name already exists."""
unit = ops_test.model.applications[app].units[0]
unit_address = await get_unit_address(ops_test, unit.name)
password = await get_password_on_unit(ops_test, "operator", unit, app)
password = await get_password(ops_test, "operator", app)

query = await execute_query_on_unit(
unit_address,
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/ha_tests/test_self_healing_1.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ async def test_forceful_restart_without_data_and_transaction_logs(
logger.info(f"rotating WAL segments on {new_primary_name}")
files = await list_wal_files(ops_test, app)
host = await get_unit_address(ops_test, new_primary_name)
password = await get_password(ops_test, down_unit=primary_name)
password = await get_password(ops_test)
with db_connect(host, password) as connection:
connection.autocommit = True
with connection.cursor() as cursor:
Expand Down
Loading