Skip to content

Commit

Permalink
PBENCH-1014
Browse files Browse the repository at this point in the history
Use Tarball.extract in the Inventory API to extract file
content from the tarball.
  • Loading branch information
riya-17 committed Jun 16, 2023
1 parent eb37ffc commit 177c1c6
Show file tree
Hide file tree
Showing 4 changed files with 215 additions and 83 deletions.
21 changes: 6 additions & 15 deletions lib/pbench/server/api/resources/datasets_inventory.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
ParamType,
Schema,
)
from pbench.server.cache_manager import CacheManager, TarballNotFound
from pbench.server.cache_manager import CacheManager, CacheType, TarballNotFound


class DatasetsInventory(ApiBase):
Expand Down Expand Up @@ -56,25 +56,16 @@ def _get(
GET /api/v1/datasets/inventory/{dataset}/{target}
"""

dataset = params.uri["dataset"]
target = params.uri.get("target")

cache_m = CacheManager(self.config, current_app.logger)
try:
tarball = cache_m.find_dataset(dataset.resource_id)
file_info, file_stream = cache_m.extract(dataset, target)
except TarballNotFound as e:
raise APIAbort(HTTPStatus.NOT_FOUND, str(e))

if target is None:
file_path = tarball.tarball_path
else:
dataset_location = tarball.unpacked
if dataset_location is None:
raise APIAbort(HTTPStatus.NOT_FOUND, "The dataset is not unpacked")
file_path = dataset_location / target

if file_path.is_file():
if file_info["type"] == CacheType.FILE:
# Tell send_file to set `Content-Disposition` to "attachment" if
# targeting the large binary tarball. Otherwise we'll recommend the
# default "inline": only `is_file()` paths are allowed here, and
Expand All @@ -85,11 +76,11 @@ def _get(
# NOTE: we could choose to be "smarter" based on file size, file
# type codes (e.g., .csv, .json), and so forth.
return send_file(
file_path,
file_stream,
as_attachment=target is None,
download_name=file_path.name,
download_name=file_info["name"],
)
elif file_path.exists():
elif file_info["type"] != CacheType.FILE:
raise APIAbort(
HTTPStatus.UNSUPPORTED_MEDIA_TYPE,
"The specified path does not refer to a regular file",
Expand Down
63 changes: 49 additions & 14 deletions lib/pbench/server/cache_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import shutil
import subprocess
import tarfile
from typing import Optional, Union
from typing import IO, Optional, Union

from pbench.common import MetadataLog, selinux
from pbench.server import JSONOBJECT, PbenchServerConfig
Expand Down Expand Up @@ -373,7 +373,6 @@ def traverse_cmap(path: Path, cachemap: dict) -> dict[str, dict]:
raise BadDirpath(
f"Found a file {file_l!r} where a directory was expected in path {str(path)!r}"
)

return f_entries[path.name]
except KeyError as exc:
raise BadDirpath(
Expand Down Expand Up @@ -430,29 +429,61 @@ def get_info(self, path: Path) -> JSONOBJECT:
return fd_info

@staticmethod
def extract(tarball_path: Path, path: str) -> Optional[str]:
"""Extract a file from the tarball and return it as a string
def filestream(tarball_path: Path, path: Path) -> Optional[IO[bytes]]:
"""Extracts filestream from the tarball
Reports failure by raising Exceptions
Args:
path: relative path within the tarball
Raise:
BadDirpath on failure extracting the file from tarball
"""
try:
return tarfile.open(tarball_path, "r:*").extractfile(path)
except Exception as exc:
raise BadDirpath(
f"A problem occurred processing {path!r} from {tarball_path!s}: {exc}"
) from exc

@staticmethod
def filecontents(tarball_path: Path, path: Path) -> Optional[str]:
    """Return the contents of one tarball member as decoded text.

    Thin convenience wrapper over `Tarball.filestream`: reads the whole
    member and decodes it (default encoding).

    NOTE(review): `filestream` is annotated Optional; if it ever returned
    None this would raise AttributeError on `.read()` rather than return
    None — confirm `path` always names a regular-file member.

    Args:
        tarball_path: path of the tarball on disk
        path: relative path of the member within the tarball

    Returns:
        The member's contents as a string
    """
    return Tarball.filestream(tarball_path, path).read().decode()

Report failures by raising exceptions.
def extract(self, path: str) -> Optional[tuple[dict, IO[bytes]]]:
"""Extracts the contents of a file from the Tarball
Args:
path: relative path within the tarball of a file
Raises:
MetadataError on failure opening the tarball
TarballUnpackError on failure to extract the named path
Returns:
The named file as a string
Tuple with file information and contents/stream
"""
name = Path(self.name)
path = name / path
info = self.get_info(path)

# FIXME: currently, when no target is provided, we return the tarball
# itself. Should we keep that behavior, or would it be better to simply
# return the info dictionary instead?
try:
tar = tarfile.open(tarball_path, "r:*")
except Exception as exc:
raise MetadataError(tarball_path, exc) from exc
try:
return tar.extractfile(path).read().decode()
if info["type"] == CacheType.FILE:
# FIXME: rather than special-casing by file name, we could check the
# size and return the contents directly for files under, say, 20 KB.
if info["name"] == "metadata.log":
return (info, Tarball.filecontents(self.tarball_path, path))
return (info, Tarball.filestream(self.tarball_path, path))
else:
return (info, None)
except Exception as exc:
raise TarballUnpackError(tarball_path, f"Unable to extract {path}") from exc
raise TarballUnpackError(
self.tarball_path, f"Unable to extract {path!r}"
) from exc

@staticmethod
def _get_metadata(tarball_path: Path) -> Optional[JSONOBJECT]:
Expand All @@ -464,7 +495,7 @@ def _get_metadata(tarball_path: Path) -> Optional[JSONOBJECT]:
"""
name = Dataset.stem(tarball_path)
try:
data = Tarball.extract(tarball_path, f"{name}/metadata.log")
data = Tarball.filecontents(tarball_path, f"{name}/metadata.log")
except TarballUnpackError:
data = None
if data:
Expand Down Expand Up @@ -1005,6 +1036,10 @@ def get_info(self, dataset_id: str, path: Path) -> dict:
tmap = tarball.get_info(path)
return tmap

def extract(self, dataset, target):
    """Locate a dataset's tarball and extract a file from it.

    Convenience pass-through: resolves the dataset to its Tarball via
    `find_dataset`, then delegates to `Tarball.extract`.

    Args:
        dataset: Dataset object; its `resource_id` identifies the tarball
        target: relative path of the desired file within the tarball

    Raises:
        TarballNotFound: propagated from `find_dataset` when no tarball
            matches the dataset's resource ID (callers in the Inventory
            API catch this)

    Returns:
        Whatever `Tarball.extract` returns — a (file-info dict,
        contents/stream) tuple per its docstring.
    """
    tarball = self.find_dataset(dataset.resource_id)
    return tarball.extract(target)

def uncache(self, dataset_id: str):
"""Remove the unpacked tarball tree.
Expand Down
126 changes: 123 additions & 3 deletions lib/pbench/test/unit/server/test_cache_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,10 @@ def fake_metadata_controller(tar_path):
# fetching metadata from metadata.log file and key/value not
# being there should result in a MetadataError
source_tarball, source_md5, md5 = tarball
dataset_name = Dataset.stem(source_tarball)
cm = CacheManager(server_config, make_logger)

expected_metaerror = f"A problem occurred processing metadata.log from {source_tarball!s}: 'Invalid Tarfile'"
expected_metaerror = f"A problem occurred processing metadata.log from {source_tarball!s}: \"A problem occurred processing '{dataset_name}/metadata.log' from {source_tarball!s}: Invalid Tarfile\""
with monkeypatch.context() as m:
m.setattr(tarfile, "open", fake_tarfile_open)
with pytest.raises(MetadataError) as exc:
Expand Down Expand Up @@ -183,10 +184,12 @@ def fake_metadata_controller(tar_path):
cm.create(source_tarball)
assert str(exc.value) == expected_metaerror

expected_msg = f"A problem occurred processing metadata.log from {source_tarball!s}: \"A problem occurred processing '{dataset_name}/metadata.log' from {source_tarball!s}: 'filename {dataset_name}/metadata.log not found'\""
with monkeypatch.context() as m:
m.setattr(tarfile.TarFile, "extractfile", fake_extract_file)
cm.create(source_tarball)
assert cm[md5].metadata is None
with pytest.raises(MetadataError) as exc:
cm.create(source_tarball)
assert str(exc.value) == expected_msg

def test_with_metadata(
self, monkeypatch, selinux_disabled, server_config, make_logger, tarball
Expand Down Expand Up @@ -367,6 +370,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path:
f11.txt
f12_sym -> ../../..
f1.json
metadata.log
Generated cache map
Expand All @@ -376,6 +380,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path:
'details': <cache_manager.FileInfo object>,
'children': {
'f1.json': {'details': <cache_manager.FileInfo object>},
'metadata.log': {'details': <cache_manager.FileInfo object>},
'subdir1': {
'details': <cache_manager.FileInfo object>,
'children': {
Expand Down Expand Up @@ -412,6 +417,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path:
sub_dir = tmp_path / dir_name
sub_dir.mkdir(parents=True, exist_ok=True)
(sub_dir / "f1.json").touch()
(sub_dir / "metadata.log").touch()
for i in range(1, 4):
(sub_dir / "subdir1" / f"subdir1{i}").mkdir(parents=True, exist_ok=True)
(sub_dir / "subdir1" / "f11.txt").touch()
Expand Down Expand Up @@ -891,6 +897,120 @@ def test_cache_map_get_info_cmap(
file_info = tb.get_info(Path(file_path))
assert file_info == expected_msg

@pytest.mark.parametrize(
    "file_path, expected_info, expected_msg",
    [
        # Tarball root: a directory entry — info only, no content stream.
        (
            "",
            {
                "directories": ["subdir1"],
                "files": ["f1.json", "metadata.log"],
                "location": Path("dir_name"),
                "name": "dir_name",
                "resolve_path": None,
                "resolve_type": None,
                "size": None,
                "type": CacheType.DIRECTORY,
            },
            None,
        ),
        # Ordinary regular file: extract returns the (mocked) byte stream.
        (
            "f1.json",
            {
                "location": Path("dir_name/f1.json"),
                "name": "f1.json",
                "resolve_path": None,
                "resolve_type": None,
                "size": 0,
                "type": CacheType.FILE,
            },
            "file_as_a_byte_stream",
        ),
        # metadata.log is special-cased by extract: it returns the
        # (mocked) decoded contents rather than a stream.
        (
            "metadata.log",
            {
                "location": Path("dir_name/metadata.log"),
                "name": "metadata.log",
                "resolve_path": None,
                "resolve_type": None,
                "size": 0,
                "type": CacheType.FILE,
            },
            "file_contents",
        ),
        # Empty subdirectory: again info only, no content.
        (
            "subdir1/subdir12",
            {
                "directories": [],
                "files": [],
                "location": Path("dir_name/subdir1/subdir12"),
                "name": "subdir12",
                "resolve_path": None,
                "resolve_type": None,
                "size": None,
                "type": CacheType.DIRECTORY,
            },
            None,
        ),
    ],
)
def test_extract(
    self, monkeypatch, tmp_path, file_path, expected_info, expected_msg
):
    """Verify Tarball.extract returns (info, contents/stream) per entry type.

    Builds a fake on-disk result tree, maps it into the Tarball cache map,
    and checks that extract() yields the expected info dict plus either a
    stream (regular file), decoded contents (metadata.log), or None
    (directories). The actual tarfile I/O is stubbed out.
    """
    tar = Path("/mock/dir_name.tar.xz")
    cache = Path("/mock/.cache")

    # Stand-ins for the real static helpers; extract() should route
    # regular files to filestream and metadata.log to filecontents,
    # so the sentinel strings identify which path was taken.
    def mock_filestream(tarball_path, path):
        return "file_as_a_byte_stream"

    def mock_filecontents(tarball_path, path):
        return "file_contents"

    with monkeypatch.context() as m:
        # Replace constructors with lightweight mocks (no real archive
        # needed) and stub the tarball-reading helpers.
        m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__)
        m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__)
        m.setattr(Tarball, "filestream", mock_filestream)
        m.setattr(Tarball, "filecontents", mock_filecontents)
        tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None))
        # Materialize the synthetic result tree and build the cache map
        # that extract() consults for entry info.
        tar_dir = TestCacheManager.MockController.generate_test_result_tree(
            tmp_path, "dir_name"
        )
        tb.cache_map(tar_dir)
        info, content = tb.extract(file_path)
        assert info == expected_info
        assert content == expected_msg

def test_extract_tarfile_open(self, monkeypatch, tmp_path):
    """Verify extract() wraps tarfile failures in TarballUnpackError.

    Forces tarfile.open to raise, then checks that extracting either a
    nested file or metadata.log surfaces a TarballUnpackError whose
    message names the tarball and the (dataset-name-prefixed) member path.
    """
    tar = Path("/mock/dir_name.tar.xz")
    cache = Path("/mock/.cache")

    # Any attempt to open the (nonexistent) tarball fails; extract()
    # must translate this into TarballUnpackError.
    def fake_tarfile_open(self, path):
        raise tarfile.TarError("Invalid Tarfile")

    with monkeypatch.context() as m:
        m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__)
        m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__)
        m.setattr(tarfile, "open", fake_tarfile_open)
        tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None))
        # A real on-disk tree backs the cache map so get_info() succeeds
        # and extract() proceeds far enough to attempt the tarfile open.
        tar_dir = TestCacheManager.MockController.generate_test_result_tree(
            tmp_path, "dir_name"
        )
        tb.cache_map(tar_dir)

        # Regular-file path (filestream route).
        path = Path(tb.name) / "subdir1/f11.txt"
        expected_error_msg = f"An error occurred while unpacking {tb.tarball_path}: Unable to extract {path!r}"
        with pytest.raises(TarballUnpackError) as exc:
            info, content = tb.extract("subdir1/f11.txt")
        assert str(exc.value) == expected_error_msg

        # metadata.log path (filecontents route) must fail identically.
        path = Path(tb.name) / "metadata.log"
        expected_error_msg = f"An error occurred while unpacking {tb.tarball_path}: Unable to extract {path!r}"
        with pytest.raises(TarballUnpackError) as exc:
            info, content = tb.extract("metadata.log")
        assert str(exc.value) == expected_error_msg

def test_find(
self, selinux_enabled, server_config, make_logger, tarball, monkeypatch
):
Expand Down
Loading

0 comments on commit 177c1c6

Please sign in to comment.