
Commit

Using Tarball.extract for extracting files from tarball
riya-17 committed Dec 6, 2022
1 parent b4a053b commit 781c8cd
Showing 3 changed files with 53 additions and 57 deletions.
30 changes: 11 additions & 19 deletions lib/pbench/server/api/resources/datasets_inventory.py
@@ -1,5 +1,6 @@
from http import HTTPStatus
from logging import Logger
import os

from flask import send_file
from flask.wrappers import Response
@@ -17,7 +18,7 @@
ParamType,
Schema,
)
from pbench.server.cache_manager import CacheManager, TarballNotFound
from pbench.server.cache_manager import CacheManager, Tarball, TarballNotFound


class DatasetsInventory(ApiBase):
@@ -63,22 +64,13 @@ def _get(self, params: ApiParams, _) -> Response:
except TarballNotFound as e:
raise APIAbort(HTTPStatus.NOT_FOUND, str(e))

if target is None:
file_path = tarball.tarball_path
else:
dataset_location = tarball.unpacked
if dataset_location is None:
raise APIAbort(HTTPStatus.NOT_FOUND, "The dataset is not unpacked")
file_path = dataset_location / target
tarball_name = tarball.tarball_path.name
target_path = (
tarball_name if target is None else (os.path.join(tarball_name, target))
)
try:
file_path = Tarball.extract(str(tarball.tarball_path), target_path)
except Exception as exc:
raise APIAbort(HTTPStatus.NOT_FOUND, str(exc))

if file_path.is_file():
return send_file(file_path)
elif file_path.exists():
raise APIAbort(
HTTPStatus.UNSUPPORTED_MEDIA_TYPE,
"The specified path does not refer to a regular file",
)
else:
raise APIAbort(
HTTPStatus.NOT_FOUND, "The specified path does not refer to a file"
)
return send_file(file_path)
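
Taken together, the endpoint changes above drop the requirement that a dataset be unpacked on disk: the handler joins the tarball's name with the requested target path and asks the new static Tarball.extract to read that member straight out of the archive, turning any failure into a 404. A minimal self-contained sketch of that read; the tarball path and member names below are invented for illustration:

import os
import tarfile

def extract(tarball_path: str, path: str) -> str:
    """Read one member out of a tarball and return its contents as text,
    mirroring the static Tarball.extract introduced by this commit."""
    # "r:*" lets tarfile pick the compression (xz, gz, ...) automatically.
    return tarfile.open(tarball_path, "r:*").extractfile(path).read().decode()

# Invented stand-ins for tarball.tarball_path and the requested target.
tarball_path = "/srv/pbench/archive/fio_2.tar.xz"
tarball_name = "fio_2"                    # top-level directory inside the archive
target = "1-default/default.csv"

# The same join the endpoint performs before calling extract(); a missing
# member surfaces as an exception, which the endpoint maps to HTTP 404.
target_path = tarball_name if target is None else os.path.join(tarball_name, target)
print(extract(tarball_path, target_path))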
17 changes: 9 additions & 8 deletions lib/pbench/server/cache_manager.py
@@ -60,15 +60,16 @@ def __str__(self) -> str:

class MetadataError(CacheManagerError):
"""
A problem was found locating or processing a tarball's metadata.log file.
A problem was found locating or processing a file within a tarball.
"""

def __init__(self, tarball: Path, error: Exception):
def __init__(self, tarball: str, file_name: str, error: Exception):
self.tarball = tarball
self.file_name = file_name
self.error = str(error)

def __str__(self) -> str:
return f"A problem occurred processing metadata.log from {self.tarball!s}: {self.error!r}"
return f"A problem occurred processing `{Path(self.file_name).name}` from {self.tarball!s}: {self.error!r}"


class TarballUnpackError(CacheManagerError):
@@ -266,11 +267,13 @@ def create(cls, tarball: Path, controller: "Controller") -> "Tarball":

return cls(destination, controller)

def extract(self, path: str) -> str:
@staticmethod
def extract(tarball_path: str, path: str) -> str:
"""
Extract a file from the tarball and return it as a string
Args:
tarball_path: path for the tarball
path: relative path within the tarball of a file
Raises:
@@ -280,11 +283,9 @@ def extract(self, path: str) -> str:
The named file as a string
"""
try:
return (
tarfile.open(self.tarball_path, "r:*").extractfile(path).read().decode()
)
return tarfile.open(tarball_path, "r:*").extractfile(path).read().decode()
except Exception as exc:
raise MetadataError(self.tarball_path, exc)
raise MetadataError(tarball_path, path, exc)

def get_metadata(self) -> JSONOBJECT:
"""
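
The hunk above also reworks MetadataError so it carries the member name alongside the tarball and reports only the member's basename. A small self-contained sketch of that behaviour, with invented paths and plain Exception standing in for CacheManagerError:

from pathlib import Path

class MetadataError(Exception):  # stands in for the CacheManagerError subclass
    """Sketch of the reworked error: it now names the member as well as the tarball."""

    def __init__(self, tarball: str, file_name: str, error: Exception):
        self.tarball = tarball
        self.file_name = file_name
        self.error = str(error)

    def __str__(self) -> str:
        # Only the basename of the failing member appears in the message.
        return (
            f"A problem occurred processing `{Path(self.file_name).name}` "
            f"from {self.tarball!s}: {self.error!r}"
        )

err = MetadataError("/srv/pbench/archive/fio_2.tar.xz", "fio_2/metadata.log", ValueError("bad header"))
print(err)  # A problem occurred processing `metadata.log` from /srv/pbench/archive/fio_2.tar.xz: 'bad header'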
63 changes: 33 additions & 30 deletions lib/pbench/test/unit/server/test_datasets_inventory.py
@@ -5,7 +5,7 @@
import requests
import werkzeug.utils

from pbench.server.cache_manager import CacheManager
from pbench.server.cache_manager import CacheManager, Tarball, MetadataError
from pbench.server.database.models.datasets import Dataset, DatasetNotFound


@@ -68,69 +68,72 @@ def test_unauthorized_access(self, query_get_as):
"message": "User drb is not authorized to READ a resource owned by test with private access"
}

def test_dataset_is_not_unpacked(self, query_get_as, monkeypatch):
def mock_find_not_unpacked(self, dataset):
class Tarball(object):
unpacked = None
def test_path_is_file(self, query_get_as, monkeypatch):
def mock_extract(tarball_path, path):
raise MetadataError(tarball_path, path, "AttributeError: 'NoneType' object has no attribute 'read'")

# Validate the resource_id
Dataset.query(resource_id=dataset)
return Tarball

monkeypatch.setattr(CacheManager, "find_dataset", mock_find_not_unpacked)

response = query_get_as("fio_2", "1-default", HTTPStatus.NOT_FOUND)
assert response.json == {"message": "The dataset is not unpacked"}

def test_path_is_directory(self, query_get_as, monkeypatch):
monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset)
monkeypatch.setattr(Tarball, "extract", mock_extract)
monkeypatch.setattr(Path, "is_file", lambda self: False)
monkeypatch.setattr(Path, "exists", lambda self: True)

response = query_get_as("fio_2", "1-default", HTTPStatus.UNSUPPORTED_MEDIA_TYPE)
response = query_get_as("fio_2", "1-default", HTTPStatus.NOT_FOUND)
assert response.json == {
"message": "The specified path does not refer to a regular file"
"message": "A problem occurred processing `1-default` from /dataset_tarball: \"AttributeError: 'NoneType' object has no attribute 'read'\""
}

def test_not_a_file(self, query_get_as, monkeypatch):
def test_path_exist(self, query_get_as, monkeypatch):
def mock_extract(tarball_path, path):
raise MetadataError(tarball_path, path, f'KeyError: "filename \'{path}\' not found"')

monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset)
monkeypatch.setattr(Tarball, "extract", mock_extract)
monkeypatch.setattr(Path, "is_file", lambda self: False)
monkeypatch.setattr(Path, "exists", lambda self: False)
monkeypatch.setattr(Path, "exists", lambda self: True)

response = query_get_as("fio_2", "1-default", HTTPStatus.NOT_FOUND)
response = query_get_as("fio_2", "metadata.log", HTTPStatus.NOT_FOUND)
assert response.json == {
"message": "The specified path does not refer to a file"
}
"message": 'A problem occurred processing `metadata.log` from /dataset_tarball: \'KeyError: "filename \\\'dataset_tarball/metadata.log\\\' not found"\''}

def test_dataset_in_given_path(self, query_get_as, monkeypatch):
@pytest.mark.parametrize("key", (None, ""))
def test_empty_target_value(self, query_get_as, monkeypatch, key):
file_sent = None

def mock_extract(tarball_path, path):
raise MetadataError(tarball_path, path, "AttributeError: 'NoneType' object has no attribute 'read'")

def mock_send_file(path_or_file, *args, **kwargs):
nonlocal file_sent
file_sent = path_or_file
return {"status": "OK"}

monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset)
monkeypatch.setattr(Tarball, "extract", mock_extract)
monkeypatch.setattr(Path, "is_file", lambda self: True)
monkeypatch.setattr(werkzeug.utils, "send_file", mock_send_file)

response = query_get_as("fio_2", "1-default/default.csv", HTTPStatus.OK)
assert response.status_code == HTTPStatus.OK
assert str(file_sent) == "/dataset/1-default/default.csv"
response = query_get_as("fio_2", key, HTTPStatus.NOT_FOUND)
assert response.status_code == HTTPStatus.NOT_FOUND
assert response.json == {
"message": "A problem occurred processing `dataset_tarball` from /dataset_tarball: \"AttributeError: 'NoneType' object has no attribute 'read'\""
}

@pytest.mark.parametrize("key", (None, ""))
def test_get_result_tarball(self, query_get_as, monkeypatch, key):
def test_dataset_in_given_path(self, query_get_as, monkeypatch):
file_sent = None

def mock_extract(tarball_path, path):
return path

def mock_send_file(path_or_file, *args, **kwargs):
nonlocal file_sent
file_sent = path_or_file
return {"status": "OK"}

monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset)
monkeypatch.setattr(Tarball, "extract", mock_extract)
monkeypatch.setattr(Path, "is_file", lambda self: True)
monkeypatch.setattr(werkzeug.utils, "send_file", mock_send_file)

response = query_get_as("fio_2", key, HTTPStatus.OK)
response = query_get_as("fio_2", "1-default/default.csv", HTTPStatus.OK)
assert response.status_code == HTTPStatus.OK
assert str(file_sent) == "/dataset_tarball"
assert str(file_sent) == "dataset_tarball/1-default/default.csv"
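
The rewritten tests drive the endpoint by monkeypatching the static Tarball.extract instead of faking an unpacked directory tree. A cut-down pytest illustration of that pattern; the Tarball class here is a toy stand-in rather than the real cache manager class, and the test relies on pytest's built-in monkeypatch fixture:

class Tarball:
    @staticmethod
    def extract(tarball_path: str, path: str) -> str:
        raise NotImplementedError("the real method reads the archive")

def test_extract_is_stubbed(monkeypatch):
    # Replace the staticmethod with a stub, the same pattern the new unit tests
    # use to simulate extraction succeeding or raising MetadataError.
    def mock_extract(tarball_path, path):
        return f"contents of {path}"

    monkeypatch.setattr(Tarball, "extract", mock_extract)
    assert Tarball.extract("/fake.tar.xz", "fio_2/metadata.log") == "contents of fio_2/metadata.log"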
