From d4eaa2bba614a7f9ee6d0740d29b2e07871bf61e Mon Sep 17 00:00:00 2001 From: Mykola Marzhan Date: Sat, 11 May 2024 08:31:51 +0200 Subject: [PATCH 01/12] Add Differential backups support --- actions.yaml | 7 +++ src/backups.py | 14 ++++- templates/pgbackrest.conf.j2 | 3 ++ tests/integration/test_backups.py | 85 ++++++++++++++++++++++++++++++- tests/unit/test_backups.py | 16 +++++- 5 files changed, 120 insertions(+), 5 deletions(-) diff --git a/actions.yaml b/actions.yaml index ba46f1108b..a2d7d41c6f 100644 --- a/actions.yaml +++ b/actions.yaml @@ -3,6 +3,13 @@ create-backup: description: Creates a backup to s3 storage in AWS. + params: + type: + type: string + description: The backup type, the default value is 'full'. + Full backup is a full copy of all data. + Differential backup is a copy only of changed data since the last full backup. + Possible values - full, differential. get-primary: description: Get the unit with is the primary/leader in the replication. get-password: diff --git a/src/backups.py b/src/backups.py index 952658e736..1046215ee6 100644 --- a/src/backups.py +++ b/src/backups.py @@ -454,8 +454,18 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent): self._initialise_stanza() - def _on_create_backup_action(self, event) -> None: + def _on_create_backup_action(self, event) -> None: # noqa: C901 """Request that pgBackRest creates a backup.""" + backup_type = event.params.get("type", "full").lower()[:4] + if backup_type not in ["full", "diff"]: + error_message = ( + f"Invalid backup type: {backup_type}. Possible values: full, differential." + ) + logger.error(f"Backup failed: {error_message}") + event.fail(error_message) + return + + logger.info(f"A {backup_type} backup has been requested on unit") can_unit_perform_backup, validation_message = self._can_unit_perform_backup() if not can_unit_perform_backup: logger.error(f"Backup failed: {validation_message}") @@ -502,7 +512,7 @@ def _on_create_backup_action(self, event) -> None: "pgbackrest", f"--stanza={self.stanza_name}", "--log-level-console=debug", - "--type=full", + f"--type={backup_type}", "backup", ] if self.charm.is_primary: diff --git a/templates/pgbackrest.conf.j2 b/templates/pgbackrest.conf.j2 index 3c2349bf4f..40e540107f 100644 --- a/templates/pgbackrest.conf.j2 +++ b/templates/pgbackrest.conf.j2 @@ -1,6 +1,7 @@ [global] backup-standby=y repo1-retention-full=9999999 +repo1-retention-history=365 repo1-type=s3 repo1-path={{ path }} repo1-s3-region={{ region }} @@ -9,6 +10,8 @@ repo1-s3-bucket={{ bucket }} repo1-s3-uri-style={{ s3_uri_style }} repo1-s3-key={{ access_key }} repo1-s3-key-secret={{ secret_key }} +repo1-block=y +repo1-bundle=y start-fast=y {%- if enable_tls %} tls-server-address=* diff --git a/tests/integration/test_backups.py b/tests/integration/test_backups.py index 261f5fff05..27aabe49cf 100644 --- a/tests/integration/test_backups.py +++ b/tests/integration/test_backups.py @@ -156,7 +156,7 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, action = await ops_test.model.units.get(replica).run_action("list-backups") await action.wait() backups = action.results.get("backups") - assert backups, "backups not outputted" + assert len(backups.split("\n")) == 1, "full backup is not outputted" await ops_test.model.wait_for_idle(status="active", timeout=1000) # Write some data. @@ -166,11 +166,85 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, connection.cursor().execute("CREATE TABLE backup_table_2 (test_collumn INT );") connection.close() + # Run the "create backup" action. + logger.info("creating a backup") + action = await ops_test.model.units.get(replica).run_action( + "create-backup", **{"type": "diff"} + ) + await action.wait() + backup_status = action.results.get("backup-status") + assert backup_status, "backup hasn't succeeded" + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle(status="active", timeout=1000) + + # Run the "list backups" action. + logger.info("listing the available backups") + action = await ops_test.model.units.get(replica).run_action("list-backups") + await action.wait() + backups = action.results.get("backups") + assert len(backups.split("\n")) == 2, "differential backup is not outputted" + await ops_test.model.wait_for_idle(status="active", timeout=1000) + + # Write some data. + logger.info("creating a second table in the database") + with db_connect(host=address, password=password) as connection: + connection.autocommit = True + connection.cursor().execute("CREATE TABLE backup_table_3 (test_collumn INT );") + connection.close() # Scale down to be able to restore. async with ops_test.fast_forward(fast_interval="60s"): await scale_application(ops_test, database_app_name, 1) - # Run the "restore backup" action. + # Run the "restore backup" action for differential backup. + for attempt in Retrying( + stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) + ): + with attempt: + logger.info("restoring the backup") + most_recent_backup = backups.split("\n")[-1] + backup_id = most_recent_backup.split()[0] + action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( + "restore", **{"backup-id": backup_id} + ) + await action.wait() + restore_status = action.results.get("restore-status") + assert restore_status, "restore hasn't succeeded" + + # Wait for the restore to complete. + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle(status="active", timeout=1000) + + # Check that the backup was correctly restored by having only the first created table. + logger.info("checking that the backup was correctly restored") + primary = await get_primary(ops_test, database_app_name) + address = await get_unit_address(ops_test, primary) + with db_connect( + host=address, password=password + ) as connection, connection.cursor() as cursor: + cursor.execute( + "SELECT EXISTS (SELECT FROM information_schema.tables" + " WHERE table_schema = 'public' AND table_name = 'backup_table_1');" + ) + assert cursor.fetchone()[ + 0 + ], "backup wasn't correctly restored: table 'backup_table_1' doesn't exist" + cursor.execute( + "SELECT EXISTS (SELECT FROM information_schema.tables" + " WHERE table_schema = 'public' AND table_name = 'backup_table_2');" + ) + assert cursor.fetchone()[ + 0 + ], "backup wasn't correctly restored: table 'backup_table_2' doesn't exist" + cursor.execute( + "SELECT EXISTS (SELECT FROM information_schema.tables" + " WHERE table_schema = 'public' AND table_name = 'backup_table_3');" + ) + assert not cursor.fetchone()[ + 0 + ], "backup wasn't correctly restored: table 'backup_table_3' exists" + connection.close() + + # Run the "restore backup" action for full backup. for attempt in Retrying( stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) ): @@ -210,6 +284,13 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, assert not cursor.fetchone()[ 0 ], "backup wasn't correctly restored: table 'backup_table_2' exists" + cursor.execute( + "SELECT EXISTS (SELECT FROM information_schema.tables" + " WHERE table_schema = 'public' AND table_name = 'backup_table_3');" + ) + assert not cursor.fetchone()[ + 0 + ], "backup wasn't correctly restored: table 'backup_table_3' exists" connection.close() # Run the following steps only in one cloud (it's enough for those checks). diff --git a/tests/unit/test_backups.py b/tests/unit/test_backups.py index 356505a7e1..9bd19a86a1 100644 --- a/tests/unit/test_backups.py +++ b/tests/unit/test_backups.py @@ -1064,8 +1064,16 @@ def test_on_create_backup_action(harness): patch("charm.PostgreSQLBackups._retrieve_s3_parameters") as _retrieve_s3_parameters, patch("charm.PostgreSQLBackups._can_unit_perform_backup") as _can_unit_perform_backup, ): - # Test when the unit cannot perform a backup. + # Test when the unit cannot perform a backup because of type. mock_event = MagicMock() + mock_event.params = {"type": "wrong"} + harness.charm.backup._on_create_backup_action(mock_event) + mock_event.fail.assert_called_once() + mock_event.set_results.assert_not_called() + + # Test when the unit cannot perform a backup because of preflight check. + mock_event = MagicMock() + mock_event.params = {"type": "full"} _can_unit_perform_backup.return_value = (False, "fake validation message") harness.charm.backup._on_create_backup_action(mock_event) mock_event.fail.assert_called_once() @@ -1073,6 +1081,7 @@ def test_on_create_backup_action(harness): # Test when the charm fails to upload a file to S3. mock_event.reset_mock() + mock_event.params = {"type": "full"} _can_unit_perform_backup.return_value = (True, None) mock_s3_parameters = { "bucket": "test-bucket", @@ -1106,6 +1115,7 @@ def test_on_create_backup_action(harness): # Test when the backup fails. mock_event.reset_mock() + mock_event.params = {"type": "full"} _upload_content_to_s3.return_value = True _is_primary.return_value = True _execute_command.side_effect = ExecError( @@ -1122,12 +1132,14 @@ def test_on_create_backup_action(harness): # Test when the backup succeeds but the charm fails to upload the backup logs. mock_event.reset_mock() + mock_event.params = {"type": "full"} _upload_content_to_s3.reset_mock() _upload_content_to_s3.side_effect = [True, False] _execute_command.side_effect = None _execute_command.return_value = "fake stdout", "fake stderr" _list_backups.return_value = {"2023-01-01T09:00:00Z": harness.charm.backup.stanza_name} _update_config.reset_mock() + mock_event.params = {"type": "full"} harness.charm.backup._on_create_backup_action(mock_event) _upload_content_to_s3.assert_has_calls([ call( @@ -1147,6 +1159,7 @@ def test_on_create_backup_action(harness): # Test when the backup succeeds (including the upload of the backup logs). mock_event.reset_mock() + mock_event.params = {"type": "full"} _upload_content_to_s3.reset_mock() _upload_content_to_s3.side_effect = None _upload_content_to_s3.return_value = True @@ -1171,6 +1184,7 @@ def test_on_create_backup_action(harness): # Test when this unit is a replica (the connectivity to the database should be changed). mock_event.reset_mock() + mock_event.params = {"type": "full"} _upload_content_to_s3.reset_mock() _is_primary.return_value = False harness.charm.backup._on_create_backup_action(mock_event) From 2d32d4f99c4b6d20c3f09812941fe47b6376b1d8 Mon Sep 17 00:00:00 2001 From: Lucas Gameiro Borges Date: Wed, 15 May 2024 08:21:59 +0000 Subject: [PATCH 02/12] adapt naame mapping logic --- src/backups.py | 56 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/src/backups.py b/src/backups.py index 1046215ee6..ea903c91a6 100644 --- a/src/backups.py +++ b/src/backups.py @@ -255,17 +255,15 @@ def _generate_backup_list_output(self) -> str: output, _ = self._execute_command(["pgbackrest", "info", "--output=json"]) backups = json.loads(output)[0]["backup"] for backup in backups: - backup_id = datetime.strftime( - datetime.strptime(backup["label"][:-1], "%Y%m%d-%H%M%S"), "%Y-%m-%dT%H:%M:%SZ" - ) + backup_id, backup_type = self._parse_backup_id(backup["label"]) error = backup["error"] backup_status = "finished" if error: backup_status = f"failed: {error}" - backup_list.append((backup_id, "physical", backup_status)) + backup_list.append((backup_id, backup_type, backup_status)) return self._format_backup_list(backup_list) - def _list_backups(self, show_failed: bool) -> OrderedDict[str, str]: + def _list_backups(self, show_failed: bool, parse=True) -> OrderedDict[str, str]: """Retrieve the list of backups. Args: @@ -286,15 +284,29 @@ def _list_backups(self, show_failed: bool) -> OrderedDict[str, str]: stanza_name = repository_info["name"] return OrderedDict[str, str]( ( - datetime.strftime( - datetime.strptime(backup["label"][:-1], "%Y%m%d-%H%M%S"), "%Y-%m-%dT%H:%M:%SZ" - ), + self._parse_backup_id(backup["label"])[0] if parse else backup["label"], stanza_name, ) for backup in backups if show_failed or not backup["error"] ) + def _parse_backup_id(self, label) -> Tuple[str, str]: + """parse backup ID as a timestamp""" + if label[-1] == "F": + timestamp = label + backup_type = "full" + elif label[-1] == "D": + timestamp = label.split("_")[1] + backup_type = "differential" + else: + raise ValueError("Unknown label format for backup ID: %s", label) + + return (datetime.strftime( + datetime.strptime(timestamp[:-1], "%Y%m%d-%H%M%S"), "%Y-%m-%dT%H:%M:%SZ" + ), backup_type) + + def _initialise_stanza(self) -> None: """Initialize the stanza. @@ -533,7 +545,7 @@ def _on_create_backup_action(self, event) -> None: # noqa: C901 else: # Generate a backup id from the current date and time if the backup failed before # generating the backup label (our backup id). - backup_id = datetime.strftime(datetime.now(), "%Y%m%d-%H%M%SF") + backup_id = self._generate_fake_backup_id(backup_type) # Upload the logs to S3. logs = f"""Stdout: @@ -674,7 +686,7 @@ def _on_restore_action(self, event): # Mark the cluster as in a restoring backup state and update the Patroni configuration. logger.info("Configuring Patroni to restore the backup") self.charm.app_peer_data.update({ - "restoring-backup": f'{datetime.strftime(datetime.strptime(backup_id, "%Y-%m-%dT%H:%M:%SZ"), "%Y%m%d-%H%M%S")}F', + "restoring-backup": self._fetch_backup_from_id(backup_id), "restore-stanza": backups[backup_id], }) self.charm.update_config() @@ -685,6 +697,30 @@ def _on_restore_action(self, event): event.set_results({"restore-status": "restore started"}) + def _generate_fake_backup_id(self, backup_type: str) -> str: + if backup_type == "F": + return datetime.strftime(datetime.now(), "%Y%m%d-%H%M%SF") + if backup_type == "D": + backups = self._list_backups(show_failed=False, parse=False).keys() + last_full_backup = None + for label in backups[::-1]: + if label.endswith("F"): + last_full_backup = label + break + + if last_full_backup is None: + raise TypeError("Differential backup requested but no previous full backup") + return f'{last_full_backup}_{datetime.strftime(datetime.now(), "%Y%m%d-%H%M%SD")}' + + def _fetch_backup_from_id(self, backup_id: str) -> str: + timestamp = f'{datetime.strftime(datetime.strptime(backup_id, "%Y-%m-%dT%H:%M:%SZ"), "%Y%m%d-%H%M%S")}' + backups = self._list_backups(show_failed=False, parse=False).keys() + for label in backups: + if timestamp in label: + return label + + return None + def _pre_restore_checks(self, event: ActionEvent) -> bool: """Run some checks before starting the restore. From e94f439087795e775a335489a510712987d69431 Mon Sep 17 00:00:00 2001 From: Lucas Gameiro Borges Date: Wed, 15 May 2024 08:25:35 +0000 Subject: [PATCH 03/12] fix linting --- src/backups.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/backups.py b/src/backups.py index ea903c91a6..911d1c3f86 100644 --- a/src/backups.py +++ b/src/backups.py @@ -268,6 +268,7 @@ def _list_backups(self, show_failed: bool, parse=True) -> OrderedDict[str, str]: Args: show_failed: whether to also return the failed backups. + parse: whether to convert backup labels to their IDs or not. Returns: a dict of previously created backups (id + stanza name) or an empty list @@ -292,7 +293,7 @@ def _list_backups(self, show_failed: bool, parse=True) -> OrderedDict[str, str]: ) def _parse_backup_id(self, label) -> Tuple[str, str]: - """parse backup ID as a timestamp""" + """Parse backup ID as a timestamp.""" if label[-1] == "F": timestamp = label backup_type = "full" @@ -301,11 +302,13 @@ def _parse_backup_id(self, label) -> Tuple[str, str]: backup_type = "differential" else: raise ValueError("Unknown label format for backup ID: %s", label) - - return (datetime.strftime( - datetime.strptime(timestamp[:-1], "%Y%m%d-%H%M%S"), "%Y-%m-%dT%H:%M:%SZ" - ), backup_type) - + + return ( + datetime.strftime( + datetime.strptime(timestamp[:-1], "%Y%m%d-%H%M%S"), "%Y-%m-%dT%H:%M:%SZ" + ), + backup_type, + ) def _initialise_stanza(self) -> None: """Initialize the stanza. From 0b2425d28f8fe8d16de9593342083b9d0ce49f15 Mon Sep 17 00:00:00 2001 From: Lucas Gameiro Borges Date: Wed, 15 May 2024 09:41:10 +0000 Subject: [PATCH 04/12] fix unit tests --- tests/unit/test_backups.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tests/unit/test_backups.py b/tests/unit/test_backups.py index 9bd19a86a1..fec7cf307a 100644 --- a/tests/unit/test_backups.py +++ b/tests/unit/test_backups.py @@ -507,15 +507,15 @@ def test_format_backup_list(harness): # Test when there are backups. backup_list = [ - ("2023-01-01T09:00:00Z", "physical", "failed: fake error"), - ("2023-01-01T10:00:00Z", "physical", "finished"), + ("2023-01-01T09:00:00Z", "full", "failed: fake error"), + ("2023-01-01T10:00:00Z", "full", "finished"), ] tc.assertEqual( harness.charm.backup._format_backup_list(backup_list), """backup-id | backup-type | backup-status ---------------------------------------------------- -2023-01-01T09:00:00Z | physical | failed: fake error -2023-01-01T10:00:00Z | physical | finished""", +2023-01-01T09:00:00Z | full | failed: fake error +2023-01-01T10:00:00Z | full | finished""", ) @@ -538,8 +538,8 @@ def test_generate_backup_list_output(harness): harness.charm.backup._generate_backup_list_output(), """backup-id | backup-type | backup-status ---------------------------------------------------- -2023-01-01T09:00:00Z | physical | failed: fake error -2023-01-01T10:00:00Z | physical | finished""", +2023-01-01T09:00:00Z | full | failed: fake error +2023-01-01T10:00:00Z | full | finished""", ) @@ -1238,15 +1238,15 @@ def test_on_list_backups_action(harness): _generate_backup_list_output.side_effect = None _generate_backup_list_output.return_value = """backup-id | backup-type | backup-status ---------------------------------------------------- -2023-01-01T09:00:00Z | physical | failed: fake error -2023-01-01T10:00:00Z | physical | finished""" +2023-01-01T09:00:00Z | full | failed: fake error +2023-01-01T10:00:00Z | full | finished""" harness.charm.backup._on_list_backups_action(mock_event) _generate_backup_list_output.assert_called_once() mock_event.set_results.assert_called_once_with({ "backups": """backup-id | backup-type | backup-status ---------------------------------------------------- -2023-01-01T09:00:00Z | physical | failed: fake error -2023-01-01T10:00:00Z | physical | finished""" +2023-01-01T09:00:00Z | full | failed: fake error +2023-01-01T10:00:00Z | full | finished""" }) mock_event.fail.assert_not_called() @@ -1261,6 +1261,7 @@ def test_on_restore_action(harness): patch("lightkube.Client.delete") as _delete, patch("ops.model.Container.stop") as _stop, patch("charm.PostgreSQLBackups._list_backups") as _list_backups, + patch("charm.PostgreSQLBackups._fetch_backup_from_id") as _fetch_backup_from_id, patch("charm.PostgreSQLBackups._pre_restore_checks") as _pre_restore_checks, ): peer_rel_id = harness.model.get_relation(PEER).id @@ -1288,6 +1289,7 @@ def test_on_restore_action(harness): harness.charm.unit.status = ActiveStatus() harness.charm.backup._on_restore_action(mock_event) _list_backups.assert_called_once_with(show_failed=False) + _fetch_backup_from_id.assert_not_called() mock_event.fail.assert_called_once() _stop.assert_not_called() _delete.assert_not_called() @@ -1362,6 +1364,7 @@ def test_on_restore_action(harness): mock_event.reset_mock() _restart_database.reset_mock() _empty_data_files.side_effect = None + _fetch_backup_from_id.return_value = "20230101-090000F" tc.assertEqual(harness.get_relation_data(peer_rel_id, harness.charm.app), {}) harness.charm.backup._on_restore_action(mock_event) _restart_database.assert_not_called() From 1235751e8dff00d22c457135c71ac3d3905edd4a Mon Sep 17 00:00:00 2001 From: Lucas Gameiro Borges Date: Wed, 15 May 2024 13:26:15 +0000 Subject: [PATCH 05/12] add sugestions + adapt integration tests --- src/backups.py | 3 +++ tests/integration/test_backups.py | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/backups.py b/src/backups.py index 911d1c3f86..51f8d4e7be 100644 --- a/src/backups.py +++ b/src/backups.py @@ -300,6 +300,9 @@ def _parse_backup_id(self, label) -> Tuple[str, str]: elif label[-1] == "D": timestamp = label.split("_")[1] backup_type = "differential" + elif label[-1] == "I": + timestamp = label.split("_")[1] + backup_type = "incremental" else: raise ValueError("Unknown label format for backup ID: %s", label) diff --git a/tests/integration/test_backups.py b/tests/integration/test_backups.py index 27aabe49cf..f90c017216 100644 --- a/tests/integration/test_backups.py +++ b/tests/integration/test_backups.py @@ -156,7 +156,8 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, action = await ops_test.model.units.get(replica).run_action("list-backups") await action.wait() backups = action.results.get("backups") - assert len(backups.split("\n")) == 1, "full backup is not outputted" + # two lines for header output, one backup line ==> 3 total lines + assert len(backups.split("\n")) == 3, "full backup is not outputted" await ops_test.model.wait_for_idle(status="active", timeout=1000) # Write some data. @@ -182,7 +183,7 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, action = await ops_test.model.units.get(replica).run_action("list-backups") await action.wait() backups = action.results.get("backups") - assert len(backups.split("\n")) == 2, "differential backup is not outputted" + assert len(backups.split("\n")) == 4, "differential backup is not outputted" await ops_test.model.wait_for_idle(status="active", timeout=1000) # Write some data. From e3164f54d3320c55db78d01e524dd6c767ed8a3f Mon Sep 17 00:00:00 2001 From: Lucas Gameiro Borges Date: Wed, 15 May 2024 14:11:46 +0000 Subject: [PATCH 06/12] pick correct backup in integration test --- tests/integration/test_backups.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_backups.py b/tests/integration/test_backups.py index f90c017216..dc70343ade 100644 --- a/tests/integration/test_backups.py +++ b/tests/integration/test_backups.py @@ -251,7 +251,7 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, ): with attempt: logger.info("restoring the backup") - most_recent_backup = backups.split("\n")[-1] + most_recent_backup = backups.split("\n")[-2] backup_id = most_recent_backup.split()[0] action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( "restore", **{"backup-id": backup_id} From c3350e10d887735f5c05663a065dec98506a10a0 Mon Sep 17 00:00:00 2001 From: Lucas Gameiro Borges Date: Wed, 15 May 2024 19:52:07 +0000 Subject: [PATCH 07/12] add docstrings + fine tuning tests --- src/backups.py | 2 + tests/integration/test_backups.py | 77 ++++++++++++++----------------- 2 files changed, 36 insertions(+), 43 deletions(-) diff --git a/src/backups.py b/src/backups.py index 51f8d4e7be..b834aecda6 100644 --- a/src/backups.py +++ b/src/backups.py @@ -704,6 +704,7 @@ def _on_restore_action(self, event): event.set_results({"restore-status": "restore started"}) def _generate_fake_backup_id(self, backup_type: str) -> str: + """Creates a backup id for failed backup operations (to store log file).""" if backup_type == "F": return datetime.strftime(datetime.now(), "%Y%m%d-%H%M%SF") if backup_type == "D": @@ -719,6 +720,7 @@ def _generate_fake_backup_id(self, backup_type: str) -> str: return f'{last_full_backup}_{datetime.strftime(datetime.now(), "%Y%m%d-%H%M%SD")}' def _fetch_backup_from_id(self, backup_id: str) -> str: + """Fetches backup's pgbackrest label from backup id.""" timestamp = f'{datetime.strftime(datetime.strptime(backup_id, "%Y-%m-%dT%H:%M:%SZ"), "%Y%m%d-%H%M%S")}' backups = self._list_backups(show_failed=False, parse=False).keys() for label in backups: diff --git a/tests/integration/test_backups.py b/tests/integration/test_backups.py index dc70343ade..8655d6ed9b 100644 --- a/tests/integration/test_backups.py +++ b/tests/integration/test_backups.py @@ -109,8 +109,6 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, await build_and_deploy( ops_test, 2, database_app_name=database_app_name, wait_for_idle=False ) - await ops_test.model.relate(database_app_name, S3_INTEGRATOR_APP_NAME) - await ops_test.model.relate(database_app_name, TLS_CERTIFICATES_APP_NAME) # Configure and set access and secret keys. logger.info(f"configuring S3 integrator for {cloud}") @@ -120,6 +118,10 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, **cloud_configs[1][cloud], ) await action.wait() + + await ops_test.model.relate(database_app_name, TLS_CERTIFICATES_APP_NAME) + await ops_test.model.relate(database_app_name, S3_INTEGRATOR_APP_NAME) + async with ops_test.fast_forward(fast_interval="60s"): await ops_test.model.wait_for_idle( apps=[database_app_name, S3_INTEGRATOR_APP_NAME], status="active", timeout=1000 @@ -156,7 +158,7 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, action = await ops_test.model.units.get(replica).run_action("list-backups") await action.wait() backups = action.results.get("backups") - # two lines for header output, one backup line ==> 3 total lines + # 2 lines for header output, 1 backup line ==> 3 total lines assert len(backups.split("\n")) == 3, "full backup is not outputted" await ops_test.model.wait_for_idle(status="active", timeout=1000) @@ -183,6 +185,7 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, action = await ops_test.model.units.get(replica).run_action("list-backups") await action.wait() backups = action.results.get("backups") + # 2 lines for header output, 2 backup lines ==> 4 total lines assert len(backups.split("\n")) == 4, "differential backup is not outputted" await ops_test.model.wait_for_idle(status="active", timeout=1000) @@ -197,19 +200,15 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, await scale_application(ops_test, database_app_name, 1) # Run the "restore backup" action for differential backup. - for attempt in Retrying( - stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) - ): - with attempt: - logger.info("restoring the backup") - most_recent_backup = backups.split("\n")[-1] - backup_id = most_recent_backup.split()[0] - action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( - "restore", **{"backup-id": backup_id} - ) - await action.wait() - restore_status = action.results.get("restore-status") - assert restore_status, "restore hasn't succeeded" + logger.info("restoring the backup") + last_diff_backup = backups.split("\n")[-1] + backup_id = last_diff_backup.split()[0] + action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( + "restore", **{"backup-id": backup_id} + ) + await action.wait() + restore_status = action.results.get("restore-status") + assert restore_status, "restore hasn't succeeded" # Wait for the restore to complete. async with ops_test.fast_forward(): @@ -246,19 +245,15 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, connection.close() # Run the "restore backup" action for full backup. - for attempt in Retrying( - stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) - ): - with attempt: - logger.info("restoring the backup") - most_recent_backup = backups.split("\n")[-2] - backup_id = most_recent_backup.split()[0] - action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( - "restore", **{"backup-id": backup_id} - ) - await action.wait() - restore_status = action.results.get("restore-status") - assert restore_status, "restore hasn't succeeded" + logger.info("restoring the backup") + last_full_backup = backups.split("\n")[-2] + backup_id = last_full_backup.split()[0] + action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( + "restore", **{"backup-id": backup_id} + ) + await action.wait() + restore_status = action.results.get("restore-status") + assert restore_status, "restore hasn't succeeded" # Wait for the restore to complete. async with ops_test.fast_forward(): @@ -307,7 +302,7 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, ) # Scale up to be able to test primary and leader being different. - async with ops_test.fast_forward(): + async with ops_test.fast_forward(fast_interval="60s"): await scale_application(ops_test, database_app_name, 2) logger.info("ensuring that the replication is working correctly") @@ -411,19 +406,15 @@ async def test_restore_on_new_cluster(ops_test: OpsTest, github_secrets) -> None ) # Run the "restore backup" action. - for attempt in Retrying( - stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) - ): - with attempt: - logger.info("restoring the backup") - most_recent_backup = backups.split("\n")[-1] - backup_id = most_recent_backup.split()[0] - action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( - "restore", **{"backup-id": backup_id} - ) - await action.wait() - restore_status = action.results.get("restore-status") - assert restore_status, "restore hasn't succeeded" + logger.info("restoring the backup") + most_recent_backup = backups.split("\n")[-1] + backup_id = most_recent_backup.split()[0] + action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( + "restore", **{"backup-id": backup_id} + ) + await action.wait() + restore_status = action.results.get("restore-status") + assert restore_status, "restore hasn't succeeded" # Wait for the restore to complete. async with ops_test.fast_forward(): From b58c52b85bb9000a9f54054df8022b1c240a8677 Mon Sep 17 00:00:00 2001 From: Lucas Gameiro Borges Date: Wed, 15 May 2024 20:52:04 +0000 Subject: [PATCH 08/12] make model deployment wait for idle before relate --- tests/integration/test_backups.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_backups.py b/tests/integration/test_backups.py index 8655d6ed9b..f476324b50 100644 --- a/tests/integration/test_backups.py +++ b/tests/integration/test_backups.py @@ -107,7 +107,7 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, # (to be able to create backups from replicas). database_app_name = f"{DATABASE_APP_NAME}-{cloud.lower()}" await build_and_deploy( - ops_test, 2, database_app_name=database_app_name, wait_for_idle=False + ops_test, 2, database_app_name=database_app_name, wait_for_idle=True ) # Configure and set access and secret keys. @@ -360,9 +360,9 @@ async def test_restore_on_new_cluster(ops_test: OpsTest, github_secrets) -> None previous_database_app_name = f"{DATABASE_APP_NAME}-gcp" database_app_name = f"new-{DATABASE_APP_NAME}" await build_and_deploy( - ops_test, 1, database_app_name=previous_database_app_name, wait_for_idle=False + ops_test, 1, database_app_name=previous_database_app_name, wait_for_idle=True ) - await build_and_deploy(ops_test, 1, database_app_name=database_app_name, wait_for_idle=False) + await build_and_deploy(ops_test, 1, database_app_name=database_app_name, wait_for_idle=True) await ops_test.model.relate(previous_database_app_name, S3_INTEGRATOR_APP_NAME) await ops_test.model.relate(database_app_name, S3_INTEGRATOR_APP_NAME) async with ops_test.fast_forward(): From 30167093b92b9216947065c9a9b6bc5a0f2ece1b Mon Sep 17 00:00:00 2001 From: Lucas Gameiro Borges Date: Wed, 15 May 2024 22:00:06 +0000 Subject: [PATCH 09/12] revert fine-tune changes --- tests/integration/test_backups.py | 72 ++++++++++++++++++------------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/tests/integration/test_backups.py b/tests/integration/test_backups.py index f476324b50..1a27548512 100644 --- a/tests/integration/test_backups.py +++ b/tests/integration/test_backups.py @@ -109,6 +109,8 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, await build_and_deploy( ops_test, 2, database_app_name=database_app_name, wait_for_idle=True ) + await ops_test.model.relate(database_app_name, S3_INTEGRATOR_APP_NAME) + await ops_test.model.relate(database_app_name, TLS_CERTIFICATES_APP_NAME) # Configure and set access and secret keys. logger.info(f"configuring S3 integrator for {cloud}") @@ -118,10 +120,6 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, **cloud_configs[1][cloud], ) await action.wait() - - await ops_test.model.relate(database_app_name, TLS_CERTIFICATES_APP_NAME) - await ops_test.model.relate(database_app_name, S3_INTEGRATOR_APP_NAME) - async with ops_test.fast_forward(fast_interval="60s"): await ops_test.model.wait_for_idle( apps=[database_app_name, S3_INTEGRATOR_APP_NAME], status="active", timeout=1000 @@ -200,15 +198,19 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, await scale_application(ops_test, database_app_name, 1) # Run the "restore backup" action for differential backup. - logger.info("restoring the backup") - last_diff_backup = backups.split("\n")[-1] - backup_id = last_diff_backup.split()[0] - action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( - "restore", **{"backup-id": backup_id} - ) - await action.wait() - restore_status = action.results.get("restore-status") - assert restore_status, "restore hasn't succeeded" + for attempt in Retrying( + stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) + ): + with attempt: + logger.info("restoring the backup") + last_diff_backup = backups.split("\n")[-1] + backup_id = last_diff_backup.split()[0] + action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( + "restore", **{"backup-id": backup_id} + ) + await action.wait() + restore_status = action.results.get("restore-status") + assert restore_status, "restore hasn't succeeded" # Wait for the restore to complete. async with ops_test.fast_forward(): @@ -245,15 +247,19 @@ async def test_backup_and_restore(ops_test: OpsTest, cloud_configs: Tuple[Dict, connection.close() # Run the "restore backup" action for full backup. - logger.info("restoring the backup") - last_full_backup = backups.split("\n")[-2] - backup_id = last_full_backup.split()[0] - action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( - "restore", **{"backup-id": backup_id} - ) - await action.wait() - restore_status = action.results.get("restore-status") - assert restore_status, "restore hasn't succeeded" + for attempt in Retrying( + stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) + ): + with attempt: + logger.info("restoring the backup") + last_full_backup = backups.split("\n")[-2] + backup_id = last_full_backup.split()[0] + action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( + "restore", **{"backup-id": backup_id} + ) + await action.wait() + restore_status = action.results.get("restore-status") + assert restore_status, "restore hasn't succeeded" # Wait for the restore to complete. async with ops_test.fast_forward(): @@ -406,15 +412,19 @@ async def test_restore_on_new_cluster(ops_test: OpsTest, github_secrets) -> None ) # Run the "restore backup" action. - logger.info("restoring the backup") - most_recent_backup = backups.split("\n")[-1] - backup_id = most_recent_backup.split()[0] - action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( - "restore", **{"backup-id": backup_id} - ) - await action.wait() - restore_status = action.results.get("restore-status") - assert restore_status, "restore hasn't succeeded" + for attempt in Retrying( + stop=stop_after_attempt(10), wait=wait_exponential(multiplier=1, min=2, max=30) + ): + with attempt: + logger.info("restoring the backup") + most_recent_backup = backups.split("\n")[-1] + backup_id = most_recent_backup.split()[0] + action = await ops_test.model.units.get(f"{database_app_name}/0").run_action( + "restore", **{"backup-id": backup_id} + ) + await action.wait() + restore_status = action.results.get("restore-status") + assert restore_status, "restore hasn't succeeded" # Wait for the restore to complete. async with ops_test.fast_forward(): From 3f50b1c220951456192e4671e92776f8f9fffc28 Mon Sep 17 00:00:00 2001 From: Lucas Gameiro Borges Date: Thu, 16 May 2024 07:49:26 +0000 Subject: [PATCH 10/12] reload patroni after restart --- src/charm.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/src/charm.py b/src/charm.py index 09ba21a1b0..8d9cad5aa4 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1428,6 +1428,13 @@ def _restart(self, event: RunWithLock) -> None: self.unit.status = BlockedStatus(error_message) return + try: + self._patroni.reload_patroni_configuration() + except RetryError: + error_message = "failed to restart patroni, exceeded number of retries" + logger.exception(error_message) + pass + # Update health check URL. self._update_pebble_layers() @@ -1551,19 +1558,23 @@ def _validate_config_options(self) -> None: def _handle_postgresql_restart_need(self): """Handle PostgreSQL restart need based on the TLS configuration and configuration changes.""" restart_postgresql = self.is_tls_enabled != self.postgresql.is_tls_enabled() - self._patroni.reload_patroni_configuration() - # Wait for some more time than the Patroni's loop_wait default value (10 seconds), - # which tells how much time Patroni will wait before checking the configuration - # file again to reload it. - try: - for attempt in Retrying(stop=stop_after_attempt(5), wait=wait_fixed(3)): - with attempt: - restart_postgresql = restart_postgresql or self.postgresql.is_restart_pending() - if not restart_postgresql: - raise Exception - except RetryError: - # Ignore the error, as it happens only to indicate that the configuration has not changed. - pass + if not restart_postgresql: + self._patroni.reload_patroni_configuration() + # Wait for some more time than the Patroni's loop_wait default value (10 seconds), + # which tells how much time Patroni will wait before checking the configuration + # file again to reload it. + try: + for attempt in Retrying(stop=stop_after_attempt(5), wait=wait_fixed(3)): + with attempt: + restart_postgresql = ( + restart_postgresql or self.postgresql.is_restart_pending() + ) + if not restart_postgresql: + raise Exception + except RetryError: + # Ignore the error, as it happens only to indicate that the configuration has not changed. + pass + self.unit_peer_data.update({"tls": "enabled" if self.is_tls_enabled else ""}) # Restart PostgreSQL if TLS configuration has changed From ff33028f244db14d7c2480f7cfcc8f75eec3fc43 Mon Sep 17 00:00:00 2001 From: Lucas Gameiro Borges Date: Thu, 16 May 2024 07:53:24 +0000 Subject: [PATCH 11/12] fix unit test --- tests/unit/test_charm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 2b7e7d2f03..96237c59e5 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -1432,7 +1432,6 @@ def test_handle_postgresql_restart_need(harness): postgresql_mock.is_restart_pending = PropertyMock(return_value=values[2]) harness.charm._handle_postgresql_restart_need() - _reload_patroni_configuration.assert_called_once() ( tc.assertIn("tls", harness.get_relation_data(rel_id, harness.charm.unit)) if values[0] @@ -1444,6 +1443,7 @@ def test_handle_postgresql_restart_need(harness): else: _generate_metrics_jobs.assert_not_called() _restart.assert_not_called() + _reload_patroni_configuration.assert_called_once() def test_set_active_status(harness): From eaca98078316d8c5010e4fdd06245b541d17cd67 Mon Sep 17 00:00:00 2001 From: Lucas Gameiro Borges Date: Thu, 16 May 2024 08:30:38 +0000 Subject: [PATCH 12/12] revert last 2 commits --- src/charm.py | 37 +++++++++++++------------------------ tests/unit/test_charm.py | 2 +- 2 files changed, 14 insertions(+), 25 deletions(-) diff --git a/src/charm.py b/src/charm.py index 8d9cad5aa4..09ba21a1b0 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1428,13 +1428,6 @@ def _restart(self, event: RunWithLock) -> None: self.unit.status = BlockedStatus(error_message) return - try: - self._patroni.reload_patroni_configuration() - except RetryError: - error_message = "failed to restart patroni, exceeded number of retries" - logger.exception(error_message) - pass - # Update health check URL. self._update_pebble_layers() @@ -1558,23 +1551,19 @@ def _validate_config_options(self) -> None: def _handle_postgresql_restart_need(self): """Handle PostgreSQL restart need based on the TLS configuration and configuration changes.""" restart_postgresql = self.is_tls_enabled != self.postgresql.is_tls_enabled() - if not restart_postgresql: - self._patroni.reload_patroni_configuration() - # Wait for some more time than the Patroni's loop_wait default value (10 seconds), - # which tells how much time Patroni will wait before checking the configuration - # file again to reload it. - try: - for attempt in Retrying(stop=stop_after_attempt(5), wait=wait_fixed(3)): - with attempt: - restart_postgresql = ( - restart_postgresql or self.postgresql.is_restart_pending() - ) - if not restart_postgresql: - raise Exception - except RetryError: - # Ignore the error, as it happens only to indicate that the configuration has not changed. - pass - + self._patroni.reload_patroni_configuration() + # Wait for some more time than the Patroni's loop_wait default value (10 seconds), + # which tells how much time Patroni will wait before checking the configuration + # file again to reload it. + try: + for attempt in Retrying(stop=stop_after_attempt(5), wait=wait_fixed(3)): + with attempt: + restart_postgresql = restart_postgresql or self.postgresql.is_restart_pending() + if not restart_postgresql: + raise Exception + except RetryError: + # Ignore the error, as it happens only to indicate that the configuration has not changed. + pass self.unit_peer_data.update({"tls": "enabled" if self.is_tls_enabled else ""}) # Restart PostgreSQL if TLS configuration has changed diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 96237c59e5..2b7e7d2f03 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -1432,6 +1432,7 @@ def test_handle_postgresql_restart_need(harness): postgresql_mock.is_restart_pending = PropertyMock(return_value=values[2]) harness.charm._handle_postgresql_restart_need() + _reload_patroni_configuration.assert_called_once() ( tc.assertIn("tls", harness.get_relation_data(rel_id, harness.charm.unit)) if values[0] @@ -1443,7 +1444,6 @@ def test_handle_postgresql_restart_need(harness): else: _generate_metrics_jobs.assert_not_called() _restart.assert_not_called() - _reload_patroni_configuration.assert_called_once() def test_set_active_status(harness):