Skip to content

Commit

Permalink
my.core.structure: add support for .tar.gz archives
Browse files Browse the repository at this point in the history
This will be useful for migrating .tar.gz processing to kompress in a backwards-compatible way, or for running against an unpacked folder structure if the user prefers.
  • Loading branch information
karlicoss committed Sep 16, 2024
1 parent 27178c0 commit d3019bc
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 21 deletions.
44 changes: 26 additions & 18 deletions my/core/structure.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import atexit
import os
import shutil
import tarfile
import tempfile
import zipfile
from contextlib import contextmanager
Expand Down Expand Up @@ -34,6 +35,7 @@ def _structure_exists(base_dir: Path, paths: Sequence[str], *, partial: bool = F


ZIP_EXT = {".zip"}
TARGZ_EXT = {".tar.gz"}


@contextmanager
Expand All @@ -44,20 +46,20 @@ def match_structure(
partial: bool = False,
) -> Generator[Tuple[Path, ...], None, None]:
"""
Given a 'base' directory or zipfile, recursively search for one or more paths that match the
Given a 'base' directory or archive (zip/tar.gz), recursively search for one or more paths that match the
pattern described in 'expected'. That can be a single string, or a list
of relative paths (as strings) you expect at the same directory.
If 'partial' is True, it only requires that one of the items in
expected be present, not all of them.
This reduces the chances of the user misconfiguring gdpr exports, e.g.
if they zipped the folders instead of the parent directory or vice-versa
if they archived the folders instead of the parent directory or vice-versa
When this finds a matching directory structure, it stops searching in that subdirectory
and continues onto other possible subdirectories which could match
If base is a zipfile, this extracts the zipfile into a temporary directory
If base is an archive, this extracts it into a temporary directory
(configured by core_config.config.get_tmp_dir), and then searches the extracted
folder for matching structures
Expand Down Expand Up @@ -93,12 +95,12 @@ def match_structure(
This doesn't require an exhaustive list of expected values, but it's a good idea to supply
a complete picture of the expected structure to avoid false-positives
This does not recursively unzip zipfiles in the subdirectories,
it only unzips into a temporary directory if 'base' is a zipfile
This does not recursively decompress archives in the subdirectories,
it only unpacks into a temporary directory if 'base' is an archive
A common pattern for using this might be to use get_files to get a list
of zipfiles or top-level gdpr export directories, and use match_structure
to search the resulting paths for a export structure you're expecting
of archives or top-level gdpr export directories, and use match_structure
to search the resulting paths for an export structure you're expecting
"""
from . import core_config as CC

Expand All @@ -108,26 +110,32 @@ def match_structure(
expected = (expected,)

is_zip: bool = base.suffix in ZIP_EXT
is_targz: bool = any(base.name.endswith(suffix) for suffix in TARGZ_EXT)

searchdir: Path = base.absolute()
try:
# if the file given by the user is a zipfile, create a temporary
# directory and extract the zipfile to that temporary directory
# if the file given by the user is an archive, create a temporary
# directory and extract it to that temporary directory
#
# this temporary directory is removed in the finally block
if is_zip:
if is_zip or is_targz:
# sanity check before we start creating directories/rm-tree'ing things
assert base.exists(), f"zipfile at {base} doesn't exist"
assert base.exists(), f"archive at {base} doesn't exist"

searchdir = Path(tempfile.mkdtemp(dir=tdir))

# base might already be a ZipPath, and str(base) would end with /
zf = zipfile.ZipFile(str(base).rstrip('/'))
zf.extractall(path=str(searchdir))

if is_zip:
# base might already be a ZipPath, and str(base) would end with /
zf = zipfile.ZipFile(str(base).rstrip('/'))
zf.extractall(path=str(searchdir))
elif is_targz:
with tarfile.open(str(base)) as tar:
tar.extractall(path=str(searchdir), filter='data')
else:
raise RuntimeError("can't happen")
else:
if not searchdir.is_dir():
raise NotADirectoryError(f"Expected either a zipfile or a directory, received {searchdir}")
raise NotADirectoryError(f"Expected either a zip/tar.gz archive or a directory, received {searchdir}")

matches: List[Path] = []
possible_targets: List[Path] = [searchdir]
Expand All @@ -150,9 +158,9 @@ def match_structure(

finally:

if is_zip:
if is_zip or is_targz:
# make sure we're not mistakenly deleting data
assert str(searchdir).startswith(str(tdir)), f"Expected the temporary directory for extracting zip to start with the temporary directory prefix ({tdir}), found {searchdir}"
assert str(searchdir).startswith(str(tdir)), f"Expected the temporary directory for extracting archive to start with the temporary directory prefix ({tdir}), found {searchdir}"

shutil.rmtree(str(searchdir))

Expand Down
7 changes: 4 additions & 3 deletions my/core/tests/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ def test_gdpr_structure_exists() -> None:
assert results == (structure_data / "gdpr_subdirs" / "gdpr_export",)


def test_gdpr_unzip() -> None:
with match_structure(structure_data / "gdpr_export.zip", expected=gdpr_expected) as results:
@pytest.mark.parametrize("archive", ["gdpr_export.zip", "gdpr_export.tar.gz"])
def test_gdpr_unpack(archive: str) -> None:
with match_structure(structure_data / archive, expected=gdpr_expected) as results:
assert len(results) == 1
extracted = results[0]
index_file = extracted / "messages" / "index.csv"
Expand All @@ -32,6 +33,6 @@ def test_match_partial() -> None:


def test_not_directory() -> None:
with pytest.raises(NotADirectoryError, match=r"Expected either a zipfile or a directory"):
with pytest.raises(NotADirectoryError, match=r"Expected either a zip/tar.gz archive or a directory"):
with match_structure(structure_data / "messages/index.csv", expected=gdpr_expected):
pass
Binary file added my/core/tests/structure_data/gdpr_export.tar.gz
Binary file not shown.

0 comments on commit d3019bc

Please sign in to comment.