From ecea4c9c82caf33b6d81b603f909dc79e2f5e73d Mon Sep 17 00:00:00 2001 From: Sebastian Utz Date: Fri, 24 May 2024 14:06:36 +0200 Subject: [PATCH] cfr: Add support to export systables to a tarfile --- cratedb_toolkit/cfr/cli.py | 4 +-- cratedb_toolkit/cfr/systable.py | 35 ++++++++++++++++++++++---- tests/cfr/test_cli.py | 44 +++++++++++++++++++++++++++++++-- 3 files changed, 74 insertions(+), 9 deletions(-) diff --git a/cratedb_toolkit/cfr/cli.py b/cratedb_toolkit/cfr/cli.py index 4aab7450..15dccfd7 100644 --- a/cratedb_toolkit/cfr/cli.py +++ b/cratedb_toolkit/cfr/cli.py @@ -46,8 +46,8 @@ def cli(ctx: click.Context, cratedb_sqlalchemy_url: str, verbose: bool, debug: b def sys_export(ctx: click.Context, target: str): cratedb_sqlalchemy_url = ctx.meta["cratedb_sqlalchemy_url"] try: - stc = SystemTableExporter(dburi=cratedb_sqlalchemy_url, target=path_from_url(target)) - path = stc.save() + stc = SystemTableExporter(dburi=cratedb_sqlalchemy_url) + path = stc.save(path_from_url(target)) jd({"path": str(path)}) except Exception as ex: error_logger(ctx)(ex) diff --git a/cratedb_toolkit/cfr/systable.py b/cratedb_toolkit/cfr/systable.py index ad9f1f0f..ae5f055f 100644 --- a/cratedb_toolkit/cfr/systable.py +++ b/cratedb_toolkit/cfr/systable.py @@ -17,6 +17,9 @@ import datetime as dt import logging +import os +import tarfile +import tempfile import typing as t from pathlib import Path @@ -92,14 +95,12 @@ class SystemTableExporter: Export schema and data from CrateDB system tables. """ - def __init__(self, dburi: str, target: t.Union[Path], data_format: DataFormat = "jsonl"): + def __init__(self, dburi: str, data_format: DataFormat = "jsonl"): self.dburi = dburi - self.target = target self.data_format = data_format self.adapter = DatabaseAdapter(dburi=self.dburi) self.info = InfoContainer(adapter=self.adapter) self.inspector = SystemTableInspector(dburi=self.dburi) - self.target.mkdir(exist_ok=True, parents=True) def read_table(self, tablename: str) -> pl.DataFrame: sql = f'SELECT * FROM "{SystemTableKnowledge.SYS_SCHEMA}"."{tablename}"' # noqa: S608 @@ -122,9 +123,27 @@ def dump_table(self, frame: pl.DataFrame, file: t.Union[t.TextIO, None] = None): else: raise NotImplementedError(f"Output format not implemented: {self.data_format}") - def save(self) -> Path: + def save(self, target: t.Union[Path]) -> Path: + temp_dir = None + if target.name.endswith(".tgz") or target.name.endswith(".tar.gz"): + temp_dir = tempfile.TemporaryDirectory() + target_folder = Path(temp_dir.name) + else: + target_folder = target + target.mkdir(exist_ok=True, parents=True) + + full_path = self.export(target_folder) + + if temp_dir is not None: + self.make_tarfile(target_folder, target) + temp_dir.cleanup() + logger.info(f"Created archive file {target}") + return target + return full_path + + def export(self, target_folder: Path) -> Path: timestamp = dt.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") - path = self.target / self.info.cluster_name / timestamp / "sys" + path = target_folder / self.info.cluster_name / timestamp / "sys" logger.info(f"Exporting system tables to: {path}") system_tables = self.inspector.table_names() path_schema = path / ExportSettings.SCHEMA_PATH @@ -166,6 +185,12 @@ def save(self) -> Path: logger.info(f"Successfully exported {table_count} system tables") return path + @staticmethod + def make_tarfile(source_folder: Path, target_file_path: Path) -> Path: + with tarfile.open(target_file_path, "x:gz") as tar: + tar.add(source_folder.absolute(), arcname=os.path.basename(source_folder)) + return target_file_path + class SystemTableImporter: """ diff --git a/tests/cfr/test_cli.py b/tests/cfr/test_cli.py index 3e0648e3..6076181a 100644 --- a/tests/cfr/test_cli.py +++ b/tests/cfr/test_cli.py @@ -1,7 +1,9 @@ import json +import os.path import re import shutil import sys +import tarfile import tests @@ -43,10 +45,48 @@ def test_cfr_cli_export(cratedb, tmp_path, caplog): assert filenames(path) == ["data", "schema"] schema_files = filenames(path / "schema") - data_files = filenames(path / "schema") + data_files = filenames(path / "data") assert len(schema_files) >= 19 - assert len(data_files) >= 19 + assert len(data_files) >= 10 + + +def test_cfr_cli_export_to_archive_file(cratedb, tmp_path, caplog): + """ + Verify `ctk cfr sys-export some-file.tgz` works. + """ + + target = os.path.join(tmp_path, "cluster-data.tgz") + + # Invoke command. + runner = CliRunner(env={"CRATEDB_SQLALCHEMY_URL": cratedb.database.dburi, "CFR_TARGET": str(tmp_path)}) + result = runner.invoke( + cli, + args=f"--debug sys-export {target}", + catch_exceptions=False, + ) + assert result.exit_code == 0 + + # Verify log output. + assert "Exporting system tables to" in caplog.text + assert re.search(r"Successfully exported \d+ system tables", caplog.text), "Log message missing" + + # Verify outcome. + path = Path(json.loads(result.output)["path"]) + assert "cluster-data.tgz" in path.name + + data_files = [] + schema_files = [] + with tarfile.open(path, "r") as tar: + name_list = tar.getnames() + for name in name_list: + if "data" in name: + data_files.append(name) + elif "schema" in name: + schema_files.append(name) + + assert len(schema_files) >= 19 + assert len(data_files) >= 10 def test_cfr_cli_import(cratedb, tmp_path, caplog):