Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
84e0b8b
Add pgbackrest
marceloneppel Sep 12, 2022
2c1bb07
Merge branch 'main' into pgbackrest-install
marceloneppel Sep 22, 2022
59c2c33
Add initial backup settings
marceloneppel Sep 23, 2022
3ccf6ea
Add additional backup settings
marceloneppel Sep 27, 2022
361597c
Remove settings
marceloneppel Sep 27, 2022
e942b0b
Rename user
marceloneppel Sep 27, 2022
df5b66c
Add test for TLS being used on pg_rewind connections
marceloneppel Sep 28, 2022
1f94e41
Remove table creation
marceloneppel Sep 29, 2022
c9335d3
Readd write to the database
marceloneppel Sep 29, 2022
6169761
Change bootstrap constraints
marceloneppel Sep 30, 2022
513f68c
Change the way the service is stopped
marceloneppel Oct 3, 2022
c2bdd31
Add one more call to service stop
marceloneppel Oct 3, 2022
47dfd21
Change the way the service is stopped
marceloneppel Oct 3, 2022
7a46d8e
Change systemd unit
marceloneppel Oct 3, 2022
15753f8
Increase test timeout
marceloneppel Oct 3, 2022
698d36d
Remove test code
marceloneppel Oct 3, 2022
c134713
Read code
marceloneppel Oct 3, 2022
82c3ff1
Readd code
marceloneppel Oct 3, 2022
f3f3a03
Change WAL trigger mechanism
marceloneppel Oct 3, 2022
00c4f9f
Fix check
marceloneppel Oct 3, 2022
746ee8a
Improve code
marceloneppel Oct 4, 2022
541af83
Remove instance promotion
marceloneppel Oct 4, 2022
42a55c8
Readd instance promotion
marceloneppel Oct 4, 2022
5544433
Change test retry logic
marceloneppel Oct 6, 2022
b5f19ec
Remove debug calls
marceloneppel Oct 6, 2022
2f67709
Add replica reinitialization
marceloneppel Oct 6, 2022
e3110f9
Change checks order
marceloneppel Oct 31, 2022
a45d8a5
Merge branch 'main' into pgbackrest-install
marceloneppel Oct 31, 2022
304cee4
Add reinitialize call
marceloneppel Nov 15, 2022
26e6618
Improve reinitialize call
marceloneppel Nov 15, 2022
d0b70cf
Remove unused code
marceloneppel Nov 16, 2022
0fc1bb2
Merge branch 'main' into pgbackrest-install
marceloneppel Dec 6, 2022
0c6ca80
Pin OS on release workflow
marceloneppel Dec 14, 2022
f407260
Change whitelist_externals to allowlist_externals
marceloneppel Dec 14, 2022
6c2ca79
Change whitelist_externals to allowlist_externals
marceloneppel Dec 14, 2022
00c2135
Merge branch 'main' into pin-os-release-workflow
marceloneppel Jan 9, 2023
1fdb331
Merge branch 'pin-os-release-workflow' into pgbackrest-install
marceloneppel Jan 12, 2023
e9d6d80
Merge branch 'main' into pgbackrest-install
marceloneppel Jan 12, 2023
55ca135
Add API request timeout
marceloneppel Jan 12, 2023
44b63e9
Add unit tests
marceloneppel Jan 13, 2023
523a687
Remove log
marceloneppel Jan 13, 2023
76d04e8
Merge branch 'main' into pgbackrest-install
marceloneppel Jan 13, 2023
0bb1414
Revert timeout
marceloneppel Jan 13, 2023
5adae46
Extract endpoint from URL to constant
marceloneppel Jan 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions actions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@ get-password:
username:
type: string
description: The username, the default value 'operator'.
Possible values - operator, replication.
Possible values - operator, replication, rewind.
set-password:
description: Change the system user's password, which is used by charm.
It is for internal charm users and SHOULD NOT be used by applications.
params:
username:
type: string
description: The username, the default value 'operator'.
Possible values - operator, replication.
Possible values - operator, replication, rewind.
password:
type: string
description: The password will be auto-generated if this option is not specified.
Expand Down
18 changes: 17 additions & 1 deletion src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
from constants import (
PEER,
REPLICATION_PASSWORD_KEY,
REWIND_PASSWORD_KEY,
SYSTEM_USERS,
TLS_CA_FILE,
TLS_CERT_FILE,
Expand Down Expand Up @@ -420,6 +421,7 @@ def _patroni(self) -> Patroni:
self._peer_members_ips,
self._get_password(),
self._replication_password,
self.get_secret("app", REWIND_PASSWORD_KEY),
bool(self.unit_peer_data.get("tls")),
)

Expand Down Expand Up @@ -521,7 +523,9 @@ def _on_install(self, event) -> None:

# Install the PostgreSQL and Patroni requirements packages.
try:
self._install_apt_packages(event, ["postgresql", "python3-pip", "python3-psycopg2"])
self._install_apt_packages(
event, ["pgbackrest", "postgresql", "python3-pip", "python3-psycopg2"]
)
except (subprocess.CalledProcessError, apt.PackageNotFoundError):
self.unit.status = BlockedStatus("failed to install apt packages")
return
Expand Down Expand Up @@ -557,6 +561,8 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
self.set_secret("app", USER_PASSWORD_KEY, new_password())
if self.get_secret("app", REPLICATION_PASSWORD_KEY) is None:
self.set_secret("app", REPLICATION_PASSWORD_KEY, new_password())
if self.get_secret("app", REWIND_PASSWORD_KEY) is None:
self.set_secret("app", REWIND_PASSWORD_KEY, new_password())

# Update the list of the current PostgreSQL hosts when a new leader is elected.
# Add this unit to the list of cluster members
Expand Down Expand Up @@ -731,6 +737,14 @@ def _on_update_status(self, _) -> None:
self.postgresql_client_relation.oversee_users()
self._update_certificate()

# Restart the workload if it's stuck on the starting state after a restart.
if (
not self._patroni.member_started
and "postgresql_restarted" in self._peers.data[self.unit]
and self._patroni.member_replication_lag == "unknown"
):
self._patroni.reinitialize_postgresql()

def _update_certificate(self) -> None:
"""Updates the TLS certificate if the unit IP changes."""
# Update the certificate if the IP changes because the IP
Expand Down Expand Up @@ -838,6 +852,7 @@ def _restart(self, _) -> None:
"""Restart PostgreSQL."""
try:
self._patroni.restart_postgresql()
self._peers.data[self.unit]["postgresql_restarted"] = "True"
except RetryError as e:
logger.error("failed to restart PostgreSQL")
self.unit.status = BlockedStatus(f"failed to restart PostgreSQL with error {e}")
Expand All @@ -863,6 +878,7 @@ def update_config(self) -> None:
# Restart PostgreSQL if TLS configuration has changed
# (so that both old and new connections use the configuration).
if restart_postgresql:
self._peers.data[self.unit].pop("postgresql_restarted", None)
self.on[self.restart_manager.name].acquire_lock.emit()


Expand Down
66 changes: 60 additions & 6 deletions src/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,13 @@
wait_fixed,
)

from constants import TLS_CA_FILE, USER
from constants import (
API_REQUEST_TIMEOUT,
PATRONI_CLUSTER_STATUS_ENDPOINT,
REWIND_USER,
TLS_CA_FILE,
USER,
)

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -64,6 +70,7 @@ def __init__(
peers_ips: Set[str],
superuser_password: str,
replication_password: str,
rewind_password: str,
tls_enabled: bool,
):
"""Initialize the Patroni class.
Expand All @@ -77,6 +84,7 @@ def __init__(
planned_units: number of units planned for the cluster
superuser_password: password for the operator user
replication_password: password for the user used in the replication
rewind_password: password for the user used on rewinds
tls_enabled: whether TLS is enabled
"""
self.unit_ip = unit_ip
Expand All @@ -87,6 +95,7 @@ def __init__(
self.peers_ips = peers_ips
self.superuser_password = superuser_password
self.replication_password = replication_password
self.rewind_password = rewind_password
self.tls_enabled = tls_enabled
# Variable mapping to requests library verify parameter.
# The CA bundle file is used to validate the server certificate when
Expand Down Expand Up @@ -131,7 +140,11 @@ def _change_owner(self, path: str) -> None:
def cluster_members(self) -> set:
"""Get the current cluster members."""
# Request info from cluster endpoint (which returns all members of the cluster).
cluster_status = requests.get(f"{self._patroni_url}/cluster", verify=self.verify)
cluster_status = requests.get(
f"{self._patroni_url}/{PATRONI_CLUSTER_STATUS_ENDPOINT}",
verify=self.verify,
timeout=API_REQUEST_TIMEOUT,
)
return set([member["name"] for member in cluster_status.json()["members"]])

def _create_directory(self, path: str, mode: int) -> None:
Expand Down Expand Up @@ -166,7 +179,11 @@ def get_member_ip(self, member_name: str) -> str:
for attempt in Retrying(stop=stop_after_attempt(len(self.peers_ips) + 1)):
with attempt:
url = self._get_alternative_patroni_url(attempt)
cluster_status = requests.get(f"{url}/cluster", verify=self.verify, timeout=10)
cluster_status = requests.get(
f"{url}/{PATRONI_CLUSTER_STATUS_ENDPOINT}",
verify=self.verify,
timeout=API_REQUEST_TIMEOUT,
)
for member in cluster_status.json()["members"]:
if member["name"] == member_name:
return member["host"]
Expand All @@ -184,7 +201,11 @@ def get_primary(self, unit_name_pattern=False) -> str:
for attempt in Retrying(stop=stop_after_attempt(len(self.peers_ips) + 1)):
with attempt:
url = self._get_alternative_patroni_url(attempt)
cluster_status = requests.get(f"{url}/cluster", verify=self.verify, timeout=10)
cluster_status = requests.get(
f"{url}/{PATRONI_CLUSTER_STATUS_ENDPOINT}",
verify=self.verify,
timeout=API_REQUEST_TIMEOUT,
)
for member in cluster_status.json()["members"]:
if member["role"] == "leader":
primary = member["name"]
Expand Down Expand Up @@ -220,7 +241,9 @@ def are_all_members_ready(self) -> bool:
for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(3)):
with attempt:
cluster_status = requests.get(
f"{self._patroni_url}/cluster", verify=self.verify
f"{self._patroni_url}/{PATRONI_CLUSTER_STATUS_ENDPOINT}",
verify=self.verify,
timeout=API_REQUEST_TIMEOUT,
)
except RetryError:
return False
Expand All @@ -243,12 +266,36 @@ def member_started(self) -> bool:
try:
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
with attempt:
r = requests.get(f"{self._patroni_url}/health", verify=self.verify)
r = requests.get(
f"{self._patroni_url}/health",
verify=self.verify,
timeout=API_REQUEST_TIMEOUT,
)
except RetryError:
return False

return r.json()["state"] == "running"

@property
def member_replication_lag(self) -> str:
    """Return this member's replication lag as reported by Patroni.

    Queries the cluster status endpoint of the local Patroni API, retrying
    every 3 seconds for up to 60 seconds, and looks up the entry whose
    ``name`` matches ``self.member_name``.

    Returns:
        The ``lag`` value taken as-is from the member's entry in the JSON
        payload, or ``"unknown"`` when the API could not be queried within
        the retry window, when this member is not listed, or when its entry
        carries no ``lag`` field.
    """
    try:
        for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
            with attempt:
                cluster_status = requests.get(
                    f"{self._patroni_url}/{PATRONI_CLUSTER_STATUS_ENDPOINT}",
                    verify=self.verify,
                    timeout=API_REQUEST_TIMEOUT,
                )
    except RetryError:
        return "unknown"

    for member in cluster_status.json()["members"]:
        if member["name"] == self.member_name:
            # Patroni does not include a "lag" key for every member (the
            # leader's entry has none), so fall back to "unknown" instead
            # of raising KeyError.
            return member.get("lag", "unknown")

    return "unknown"

def render_file(self, path: str, content: str, mode: int) -> None:
"""Write a content rendered from a template to a file.

Expand Down Expand Up @@ -296,6 +343,8 @@ def render_patroni_yml_file(self, enable_tls: bool = False) -> None:
superuser=USER,
superuser_password=self.superuser_password,
replication_password=self.replication_password,
rewind_user=REWIND_USER,
rewind_password=self.rewind_password,
version=self._get_postgresql_version(),
)
self.render_file(f"{self.storage_path}/patroni.yml", rendered, 0o644)
Expand Down Expand Up @@ -395,3 +444,8 @@ def restart_patroni(self) -> bool:
def restart_postgresql(self) -> None:
"""Restart PostgreSQL."""
requests.post(f"{self._patroni_url}/restart", verify=self.verify)

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def reinitialize_postgresql(self) -> None:
    """Ask Patroni to reinitialize PostgreSQL on this member.

    Sends a POST request to the ``/reinitialize`` endpoint of the Patroni
    API. The call is attempted up to three times, waiting between attempts
    with an exponential backoff bounded to 2-10 seconds.
    """
    requests.post(
        f"{self._patroni_url}/reinitialize",
        verify=self.verify,
        # Bound the request like the other Patroni API calls so that an
        # unresponsive API cannot block the caller indefinitely; a timeout
        # raises and lets the retry policy above take over.
        timeout=API_REQUEST_TIMEOUT,
    )
6 changes: 5 additions & 1 deletion src/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,16 @@
LEGACY_DB_ADMIN = "db-admin"
PEER = "database-peers"
ALL_CLIENT_RELATIONS = [DATABASE, LEGACY_DB, LEGACY_DB_ADMIN]
API_REQUEST_TIMEOUT = 5
PATRONI_CLUSTER_STATUS_ENDPOINT = "cluster"
REPLICATION_USER = "replication"
REPLICATION_PASSWORD_KEY = "replication-password"
REWIND_USER = "rewind"
REWIND_PASSWORD_KEY = "rewind-password"
TLS_KEY_FILE = "key.pem"
TLS_CA_FILE = "ca.pem"
TLS_CERT_FILE = "cert.pem"
USER = "operator"
USER_PASSWORD_KEY = "operator-password"
# List of system usernames needed for correct work of the charm/workload.
SYSTEM_USERS = [REPLICATION_USER, USER]
SYSTEM_USERS = [REPLICATION_USER, REWIND_USER, USER]
9 changes: 9 additions & 0 deletions templates/patroni.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ bootstrap:
maximum_lag_on_failover: 1048576
postgresql:
use_pg_rewind: true
remove_data_directory_on_rewind_failure: true
remove_data_directory_on_diverged_timelines: true
parameters:
archive_command: /bin/true
archive_mode: on
wal_level: logical

initdb:
- encoding: UTF8
Expand Down Expand Up @@ -80,6 +86,9 @@ postgresql:
replication:
username: replication
password: {{ replication_password }}
rewind:
username: {{ rewind_user }}
password: {{ rewind_password }}
superuser:
username: {{ superuser }}
password: {{ superuser_password }}
Loading