Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
81f055d
More aggressive idle checks
dragomirp Jun 18, 2025
9456b49
Explicit idle
dragomirp Jun 18, 2025
09ca1e5
Idle period when relating to the test app
dragomirp Jun 18, 2025
1fa040e
Remove second start
dragomirp Jun 19, 2025
daff032
Remove log warning
dragomirp Jun 19, 2025
8b0e099
Hold create db hook for longer
dragomirp Jun 19, 2025
efde10d
Bump the pg_hba checker timeout
dragomirp Jun 19, 2025
f6de4ae
Don't update config
dragomirp Jun 19, 2025
aaada84
Bump timeout
dragomirp Jun 19, 2025
8f42b60
Merge branch 'main' into fix-self-healing
dragomirp Jun 20, 2025
5fb6e65
Try to just append to pg_hba
dragomirp Jun 20, 2025
2b421e0
Merge branch 'main' into fix-self-healing
dragomirp Jun 24, 2025
a2baa62
Sync hba changes before creating db resources
dragomirp Jun 25, 2025
2d3d97f
Force regenerate hash and config on leader
dragomirp Jun 25, 2025
71e392d
Use current host to check hba
dragomirp Jun 25, 2025
d3edfba
Update libs
dragomirp Jun 25, 2025
5978879
Compare to local hash
dragomirp Jun 25, 2025
e04b552
Cla check for 16/edge
dragomirp Jun 25, 2025
41da715
Don't defer peer change before init
dragomirp Jun 25, 2025
12ba662
Add back app check
dragomirp Jun 25, 2025
a264193
Revert back to just updating peer data
dragomirp Jun 25, 2025
d8c0a00
Only sync hba once initially set
dragomirp Jun 25, 2025
2226c0a
Bump timeout
dragomirp Jun 25, 2025
2efe251
Merge branch 'main' into fix-self-healing
dragomirp Jun 26, 2025
cb5b320
Merge branch 'main' into fix-self-healing
dragomirp Jun 26, 2025
85cdbe7
Don't filter appends to pg_hba
dragomirp Jun 26, 2025
49574a8
Append the rel users directly to the user map
dragomirp Jul 1, 2025
5d5837b
Merge branch 'main' into fix-self-healing
dragomirp Jul 2, 2025
f94f6f3
Add idle timeout
dragomirp Jul 3, 2025
d9add35
Remove trigger
dragomirp Jul 15, 2025
fad3ed7
Merge branch 'main' into fix-self-healing
dragomirp Jul 15, 2025
4f23dab
Sleep longer
dragomirp Jul 15, 2025
af03cd0
Set extra user roles
dragomirp Jul 15, 2025
62834af
Always update hash
dragomirp Jul 16, 2025
80ee8f8
Bump sleep period
dragomirp Jul 16, 2025
78aef57
Revert the trigger
dragomirp Jul 16, 2025
3a39640
Move generate_user_hash to charm
dragomirp Jul 17, 2025
f80167e
Conditional hash update
dragomirp Jul 18, 2025
6844c7d
Try to sort keys
dragomirp Jul 19, 2025
44615bc
Revert to relation user hash
dragomirp Jul 19, 2025
cec1ec0
Try to reduce the amount of ifs
dragomirp Jul 21, 2025
636c16f
Remove trigger
dragomirp Jul 22, 2025
d59b9ff
Blocked test app
dragomirp Jul 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 1 addition & 78 deletions lib/charms/postgresql_k8s/v0/postgresql.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 53
LIBPATCH = 54

# Groups to distinguish HBA access
ACCESS_GROUP_IDENTITY = "identity_access"
Expand Down Expand Up @@ -782,83 +782,6 @@ def set_up_database(self) -> None:
connection = None
cursor = None
try:
with self._connect_to_database(
database="template1"
) as connection, connection.cursor() as cursor:
# Create database function and event trigger to identify users created by PgBouncer.
cursor.execute(
"SELECT TRUE FROM pg_event_trigger WHERE evtname = 'update_pg_hba_on_create_schema';"
)
if cursor.fetchone() is None:
cursor.execute("""
CREATE OR REPLACE FUNCTION update_pg_hba()
RETURNS event_trigger
LANGUAGE plpgsql
AS $$
DECLARE
hba_file TEXT;
copy_command TEXT;
connection_type TEXT;
rec record;
insert_value TEXT;
changes INTEGER = 0;
BEGIN
-- Don't execute on replicas.
IF NOT pg_is_in_recovery() THEN
-- Load the current authorisation rules.
DROP TABLE IF EXISTS pg_hba;
CREATE TEMPORARY TABLE pg_hba (lines TEXT);
SELECT setting INTO hba_file FROM pg_settings WHERE name = 'hba_file';
IF hba_file IS NOT NULL THEN
copy_command='COPY pg_hba FROM ''' || hba_file || '''' ;
EXECUTE copy_command;
-- Build a list of the relation users and the databases they can access.
DROP TABLE IF EXISTS relation_users;
CREATE TEMPORARY TABLE relation_users AS
SELECT t.user, STRING_AGG(DISTINCT t.database, ',') AS databases FROM( SELECT u.usename AS user, CASE WHEN u.usesuper THEN 'all' ELSE d.datname END AS database FROM ( SELECT usename, usesuper FROM pg_catalog.pg_user WHERE usename NOT IN ('backup', 'monitoring', 'operator', 'postgres', 'replication', 'rewind')) AS u JOIN ( SELECT datname FROM pg_catalog.pg_database WHERE NOT datistemplate ) AS d ON has_database_privilege(u.usename, d.datname, 'CONNECT') ) AS t GROUP BY 1;
IF (SELECT COUNT(lines) FROM pg_hba WHERE lines LIKE 'hostssl %') > 0 THEN
connection_type := 'hostssl';
ELSE
connection_type := 'host';
END IF;
-- Add the new users to the pg_hba file.
FOR rec IN SELECT * FROM relation_users
LOOP
insert_value := connection_type || ' ' || rec.databases || ' ' || rec.user || ' 0.0.0.0/0 md5';
IF (SELECT COUNT(lines) FROM pg_hba WHERE lines = insert_value) = 0 THEN
INSERT INTO pg_hba (lines) VALUES (insert_value);
changes := changes + 1;
END IF;
END LOOP;
-- Remove users that don't exist anymore from the pg_hba file.
FOR rec IN SELECT h.lines FROM pg_hba AS h LEFT JOIN relation_users AS r ON SPLIT_PART(h.lines, ' ', 3) = r.user WHERE r.user IS NULL AND (SPLIT_PART(h.lines, ' ', 3) LIKE 'relation_id_%' OR SPLIT_PART(h.lines, ' ', 3) LIKE 'pgbouncer_auth_relation_%' OR SPLIT_PART(h.lines, ' ', 3) LIKE '%_user_%_%')
LOOP
DELETE FROM pg_hba WHERE lines = rec.lines;
changes := changes + 1;
END LOOP;
-- Apply the changes to the pg_hba file.
IF changes > 0 THEN
copy_command='COPY pg_hba TO ''' || hba_file || '''' ;
EXECUTE copy_command;
PERFORM pg_reload_conf();
END IF;
END IF;
END IF;
END;
$$;
""")
cursor.execute("""
CREATE EVENT TRIGGER update_pg_hba_on_create_schema
ON ddl_command_end
WHEN TAG IN ('CREATE SCHEMA')
EXECUTE FUNCTION update_pg_hba();
""")
cursor.execute("""
CREATE EVENT TRIGGER update_pg_hba_on_drop_schema
ON ddl_command_end
WHEN TAG IN ('DROP SCHEMA')
EXECUTE FUNCTION update_pg_hba();
""")
with self._connect_to_database() as connection, connection.cursor() as cursor:
cursor.execute("SELECT TRUE FROM pg_roles WHERE rolname='admin';")
if cursor.fetchone() is None:
Expand Down
75 changes: 54 additions & 21 deletions scripts/authorisation_rules_observer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,66 @@
from urllib.parse import urljoin
from urllib.request import urlopen

import psycopg2
import yaml

API_REQUEST_TIMEOUT = 5
PATRONI_CLUSTER_STATUS_ENDPOINT = "cluster"
PATRONI_CONFIG_STATUS_ENDPOINT = "config"
PATRONI_CONF_FILE_PATH = "/var/lib/postgresql/data/patroni.yml"

# File path for the spawned cluster topology observer process to write logs.
LOG_FILE_PATH = "/var/log/authorisation_rules_observer.log"


class UnreachableUnitsError(Exception):
"""Cannot reach any known cluster member."""


def dispatch(run_cmd, unit, charm_dir):
"""Use the input juju-run command to dispatch a :class:`AuthorisationRulesChangeEvent`."""
dispatch_sub_cmd = "JUJU_DISPATCH_PATH=hooks/authorisation_rules_change {}/dispatch"
def dispatch(run_cmd, unit, charm_dir, custom_event):
"""Use the input juju-run command to dispatch a custom event."""
dispatch_sub_cmd = "JUJU_DISPATCH_PATH=hooks/{} {}/dispatch"
# Input is generated by the charm
subprocess.run([run_cmd, "-u", unit, dispatch_sub_cmd.format(charm_dir)]) # noqa: S603
subprocess.run([run_cmd, "-u", unit, dispatch_sub_cmd.format(custom_event, charm_dir)]) # noqa: S603


def check_for_database_changes(run_cmd, unit, charm_dir, previous_databases):
"""Check for changes in the databases.

If changes are detected, dispatch an event to handle them.
"""
with open(PATRONI_CONF_FILE_PATH) as conf_file:
conf_file_contents = yaml.safe_load(conf_file)
password = conf_file_contents["postgresql"]["authentication"]["superuser"]["password"]
connection = None
try:
# Input is generated by the charm
with (
psycopg2.connect(
"dbname='postgres' user='operator' host='localhost' "
f"password='{password}' connect_timeout=1"
) as connection,
connection.cursor() as cursor,
):
cursor.execute("SELECT datname, datacl FROM pg_database;")
current_databases = cursor.fetchall()
except psycopg2.Error as e:
with open(LOG_FILE_PATH, "a") as log_file:
log_file.write(f"Failed to retrieve databases: {e}\n")
return previous_databases
else:
# If it's the first time the databases were retrieved, then store it and use
# it for subsequent checks.
if not previous_databases:
previous_databases = current_databases
# If the databases changed, dispatch a charm event to handle this change.
elif current_databases != previous_databases:
previous_databases = current_databases
dispatch(run_cmd, unit, charm_dir, "databases_change")
return previous_databases
finally:
if connection:
connection.close()


def main():
Expand All @@ -34,7 +80,7 @@ def main():
"""
patroni_urls, run_cmd, unit, charm_dir = sys.argv[1:]

previous_authorisation_rules = []
previous_databases = None
urls = [urljoin(url, PATRONI_CLUSTER_STATUS_ENDPOINT) for url in patroni_urls.split(",")]
member_name = unit.replace("/", "-")
while True:
Expand Down Expand Up @@ -67,22 +113,9 @@ def main():
break

if is_primary:
# Read contents from the pg_hba.conf file.
with open("/var/lib/postgresql/data/pgdata/pg_hba.conf") as file:
current_authorisation_rules = file.read()
current_authorisation_rules = [
line
for line in current_authorisation_rules.splitlines()
if not line.startswith("#")
]
# If it's the first time the authorisation rules were retrieved, then store it and use
# it for subsequent checks.
if not previous_authorisation_rules:
previous_authorisation_rules = current_authorisation_rules
# If the authorisation rules changed, dispatch a charm event to handle this change.
elif current_authorisation_rules != previous_authorisation_rules:
previous_authorisation_rules = current_authorisation_rules
dispatch(run_cmd, unit, charm_dir)
previous_databases = check_for_database_changes(
run_cmd, unit, charm_dir, previous_databases
)

# Wait some time before checking again for a authorisation rules change.
sleep(30)
Expand Down
24 changes: 20 additions & 4 deletions src/authorisation_rules_observer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import signal
import subprocess
import typing
from pathlib import Path
from sys import version_info

from ops.charm import CharmEvents
from ops.framework import EventBase, EventSource, Object
Expand All @@ -22,17 +24,17 @@
LOG_FILE_PATH = "/var/log/authorisation_rules_observer.log"


class AuthorisationRulesChangeEvent(EventBase):
"""A custom event for authorisation rules changes."""
class DatabasesChangeEvent(EventBase):
"""A custom event for databases changes."""


class AuthorisationRulesChangeCharmEvents(CharmEvents):
"""A CharmEvents extension for authorisation rules changes.

Includes :class:`AuthorisationRulesChangeEvent` in those that can be handled.
Includes :class:`DatabasesChangeEventt` in those that can be handled.
"""

authorisation_rules_change = EventSource(AuthorisationRulesChangeEvent)
databases_change = EventSource(DatabasesChangeEvent)


class AuthorisationRulesObserver(Object):
Expand Down Expand Up @@ -74,6 +76,20 @@ def start_authorisation_rules_observer(self):
# in a hook context, as Juju will disallow use of juju-run.
new_env = os.environ.copy()
new_env.pop("JUJU_CONTEXT_ID", None)
# Generate the venv path based on the existing lib path
for loc in new_env["PYTHONPATH"].split(":"):
path = Path(loc)
venv_path = (
path
/ ".."
/ "venv"
/ "lib"
/ f"python{version_info.major}.{version_info.minor}"
/ "site-packages"
)
if path.stem == "lib":
new_env["PYTHONPATH"] = f"{venv_path.resolve()}:{new_env['PYTHONPATH']}"
break

urls = [
self._charm._patroni._patroni_url.replace(self._charm.endpoint, endpoint)
Expand Down
44 changes: 35 additions & 9 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

"""Charmed Kubernetes Operator for the PostgreSQL database."""

import datetime
import itertools
import json
import logging
Expand All @@ -13,6 +12,8 @@
import shutil
import sys
import time
from datetime import datetime
from hashlib import shake_128
from pathlib import Path
from typing import Literal, get_args
from urllib.parse import urlparse
Expand Down Expand Up @@ -221,9 +222,7 @@ def __init__(self, *args):
"/usr/bin/juju-exec" if self.model.juju_version.major > 2 else "/usr/bin/juju-run"
)
self._observer = AuthorisationRulesObserver(self, run_cmd)
self.framework.observe(
self.on.authorisation_rules_change, self._on_authorisation_rules_change
)
self.framework.observe(self.on.databases_change, self._on_databases_change)
self.framework.observe(self.on.config_changed, self._on_config_changed)
self.framework.observe(self.on.leader_elected, self._on_leader_elected)
self.framework.observe(self.on[PEER].relation_changed, self._on_peer_relation_changed)
Expand Down Expand Up @@ -281,9 +280,11 @@ def __init__(self, *args):
self, relation_name=TRACING_RELATION_NAME, protocols=[TRACING_PROTOCOL]
)

def _on_authorisation_rules_change(self, _):
"""Handle authorisation rules change event."""
timestamp = datetime.datetime.now()
def _on_databases_change(self, _):
"""Handle databases change event."""
self.update_config()
logger.debug("databases changed")
timestamp = datetime.now()
self._peers.data[self.unit].update({"pg_hba_needs_update_timestamp": str(timestamp)})
logger.debug(f"authorisation rules changed at {timestamp}")

Expand Down Expand Up @@ -580,14 +581,14 @@ def _on_peer_relation_changed(self, event: HookEvent) -> None: # noqa: C901
if self.unit.is_leader():
if self._initialize_cluster(event):
logger.debug("Deferring on_peer_relation_changed: Leader initialized cluster")
event.defer()
else:
logger.debug("_initialized_cluster failed on _peer_relation_changed")
return
else:
logger.debug(
"Deferring on_peer_relation_changed: Cluster must be initialized before members can join"
"Early exit on_peer_relation_changed: Cluster must be initialized before members can join"
)
event.defer()
return

# If the leader is the one receiving the event, it adds the new members,
Expand Down Expand Up @@ -2119,6 +2120,9 @@ def update_config(self, is_creating_backup: bool = False) -> bool:
self._restart_metrics_service()
self._restart_ldap_sync_service()

self.unit_peer_data.update({"user_hash": self.generate_user_hash})
if self.unit.is_leader():
self.app_peer_data.update({"user_hash": self.generate_user_hash})
return True

def _validate_config_options(self) -> None:
Expand Down Expand Up @@ -2316,8 +2320,30 @@ def relations_user_databases_map(self) -> dict:
user, current_host=self.is_connectivity_enabled
)
)

# Copy relations users directly instead of waiting for them to be created
for relation in self.model.relations[self.postgresql_client_relation.relation_name]:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we also copy the legacy users as well here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be ideal, but I don't know if apps connecting through the legacy relation are using replicas. I think we can keep only the new relation users for now, so we keep the same code for 16/edge.

user = f"relation_id_{relation.id}"
if user not in user_database_map and (
database := self.postgresql_client_relation.database_provides.fetch_relation_field(
relation.id, "database"
)
):
user_database_map[user] = database
return user_database_map

@property
def generate_user_hash(self) -> str:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tried to hash the relations_user_databases_map property, but that never idled in the new relation integration test.

"""Generate expected user and database hash."""
user_db_pairs = {}
for relation in self.model.relations[self.postgresql_client_relation.relation_name]:
if database := self.postgresql_client_relation.database_provides.fetch_relation_field(
relation.id, "database"
):
user = f"relation_id_{relation.id}"
user_db_pairs[user] = database
return shake_128(str(user_db_pairs).encode()).hexdigest(16)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dict key order should be deterministic for units on the same platform.


def override_patroni_on_failure_condition(
self, new_condition: str, repeat_cause: str | None
) -> bool:
Expand Down
Loading