Skip to content

Commit

Permalink
PBENCH-1014
Browse files Browse the repository at this point in the history
Use Tarball.extract in the Inventory API to extract file
content from the tarball.
  • Loading branch information
riya-17 committed Jun 16, 2023
1 parent eb37ffc commit 177c1c6
Show file tree
Hide file tree
Showing 4 changed files with 215 additions and 83 deletions.
21 changes: 6 additions & 15 deletions lib/pbench/server/api/resources/datasets_inventory.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
ParamType,
Schema,
)
from pbench.server.cache_manager import CacheManager, TarballNotFound
from pbench.server.cache_manager import CacheManager, CacheType, TarballNotFound


class DatasetsInventory(ApiBase):
Expand Down Expand Up @@ -56,25 +56,16 @@ def _get(
GET /api/v1/datasets/inventory/{dataset}/{target}
"""

dataset = params.uri["dataset"]
target = params.uri.get("target")

cache_m = CacheManager(self.config, current_app.logger)
try:
tarball = cache_m.find_dataset(dataset.resource_id)
file_info, file_stream = cache_m.extract(dataset, target)
except TarballNotFound as e:
raise APIAbort(HTTPStatus.NOT_FOUND, str(e))

if target is None:
file_path = tarball.tarball_path
else:
dataset_location = tarball.unpacked
if dataset_location is None:
raise APIAbort(HTTPStatus.NOT_FOUND, "The dataset is not unpacked")
file_path = dataset_location / target

if file_path.is_file():
if file_info["type"] == CacheType.FILE:
# Tell send_file to set `Content-Disposition` to "attachment" if
# targeting the large binary tarball. Otherwise we'll recommend the
# default "inline": only `is_file()` paths are allowed here, and
Expand All @@ -85,11 +76,11 @@ def _get(
# NOTE: we could choose to be "smarter" based on file size, file
# type codes (e.g., .csv, .json), and so forth.
return send_file(
file_path,
file_stream,
as_attachment=target is None,
download_name=file_path.name,
download_name=file_info["name"],
)
elif file_path.exists():
elif file_info["type"] != CacheType.FILE:
raise APIAbort(
HTTPStatus.UNSUPPORTED_MEDIA_TYPE,
"The specified path does not refer to a regular file",
Expand Down
63 changes: 49 additions & 14 deletions lib/pbench/server/cache_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import shutil
import subprocess
import tarfile
from typing import Optional, Union
from typing import IO, Optional, Union

from pbench.common import MetadataLog, selinux
from pbench.server import JSONOBJECT, PbenchServerConfig
Expand Down Expand Up @@ -373,7 +373,6 @@ def traverse_cmap(path: Path, cachemap: dict) -> dict[str, dict]:
raise BadDirpath(
f"Found a file {file_l!r} where a directory was expected in path {str(path)!r}"
)

return f_entries[path.name]
except KeyError as exc:
raise BadDirpath(
Expand Down Expand Up @@ -430,29 +429,61 @@ def get_info(self, path: Path) -> JSONOBJECT:
return fd_info

@staticmethod
def extract(tarball_path: Path, path: str) -> Optional[str]:
"""Extract a file from the tarball and return it as a string
def filestream(tarball_path: Path, path: Path) -> Optional[IO[bytes]]:
"""Extracts filestream from the tarball
Reports failure by raising Exceptions
Args:
path: relative path within the tarball
Raise:
BadDirpath on failure extracting the file from tarball
"""
try:
return tarfile.open(tarball_path, "r:*").extractfile(path)
except Exception as exc:
raise BadDirpath(
f"A problem occurred processing {path!r} from {tarball_path!s}: {exc}"
) from exc

@staticmethod
def filecontents(tarball_path: Path, path: Path) -> Optional[str]:
    """Return the contents of one tarball member as decoded text.

    Thin convenience wrapper over `Tarball.filestream`: reads the whole
    member and decodes it (default encoding).

    NOTE(review): `filestream` is annotated Optional; if it ever returned
    None this would raise AttributeError on `.read()` rather than return
    None — confirm `path` always names a regular-file member.

    Args:
        tarball_path: path of the tarball on disk
        path: relative path of the member within the tarball

    Returns:
        The member's contents as a string
    """
    return Tarball.filestream(tarball_path, path).read().decode()

Report failures by raising exceptions.
def extract(self, path: str) -> Optional[tuple[dict, IO[bytes]]]:
"""Extracts the contents of a file from the Tarball
Args:
path: relative path within the tarball of a file
Raises:
MetadataError on failure opening the tarball
TarballUnpackError on failure to extract the named path
Returns:
The named file as a string
Tuple with file information and contents/stream
"""
name = Path(self.name)
path = name / path
info = self.get_info(path)

# FIXME: currently, when no target is provided, we return the tarball
# itself. Should we keep that behavior, or would it be better to simply
# return the info dictionary instead?
try:
tar = tarfile.open(tarball_path, "r:*")
except Exception as exc:
raise MetadataError(tarball_path, exc) from exc
try:
return tar.extractfile(path).read().decode()
if info["type"] == CacheType.FILE:
# FIXME: rather than special-casing by file name, we could check the
# size and return the contents directly for files under, say, 20 KB.
if info["name"] == "metadata.log":
return (info, Tarball.filecontents(self.tarball_path, path))
return (info, Tarball.filestream(self.tarball_path, path))
else:
return (info, None)
except Exception as exc:
raise TarballUnpackError(tarball_path, f"Unable to extract {path}") from exc
raise TarballUnpackError(
self.tarball_path, f"Unable to extract {path!r}"
) from exc

@staticmethod
def _get_metadata(tarball_path: Path) -> Optional[JSONOBJECT]:
Expand All @@ -464,7 +495,7 @@ def _get_metadata(tarball_path: Path) -> Optional[JSONOBJECT]:
"""
name = Dataset.stem(tarball_path)
try:
data = Tarball.extract(tarball_path, f"{name}/metadata.log")
data = Tarball.filecontents(tarball_path, f"{name}/metadata.log")
except TarballUnpackError:
data = None
if data:
Expand Down Expand Up @@ -1005,6 +1036,10 @@ def get_info(self, dataset_id: str, path: Path) -> dict:
tmap = tarball.get_info(path)
return tmap

def extract(self, dataset, target):
    """Locate a dataset's tarball and extract a file from it.

    Convenience pass-through: resolves the dataset to its Tarball via
    `find_dataset`, then delegates to `Tarball.extract`.

    Args:
        dataset: Dataset object; its `resource_id` identifies the tarball
        target: relative path of the desired file within the tarball

    Raises:
        TarballNotFound: propagated from `find_dataset` when no tarball
            matches the dataset's resource ID (callers in the Inventory
            API catch this)

    Returns:
        Whatever `Tarball.extract` returns — a (file-info dict,
        contents/stream) tuple per its docstring.
    """
    tarball = self.find_dataset(dataset.resource_id)
    return tarball.extract(target)

def uncache(self, dataset_id: str):
"""Remove the unpacked tarball tree.
Expand Down
126 changes: 123 additions & 3 deletions lib/pbench/test/unit/server/test_cache_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,10 @@ def fake_metadata_controller(tar_path):
# fetching metadata from metadata.log file and key/value not
# being there should result in a MetadataError
source_tarball, source_md5, md5 = tarball
dataset_name = Dataset.stem(source_tarball)
cm = CacheManager(server_config, make_logger)

expected_metaerror = f"A problem occurred processing metadata.log from {source_tarball!s}: 'Invalid Tarfile'"
expected_metaerror = f"A problem occurred processing metadata.log from {source_tarball!s}: \"A problem occurred processing '{dataset_name}/metadata.log' from {source_tarball!s}: Invalid Tarfile\""
with monkeypatch.context() as m:
m.setattr(tarfile, "open", fake_tarfile_open)
with pytest.raises(MetadataError) as exc:
Expand Down Expand Up @@ -183,10 +184,12 @@ def fake_metadata_controller(tar_path):
cm.create(source_tarball)
assert str(exc.value) == expected_metaerror

expected_msg = f"A problem occurred processing metadata.log from {source_tarball!s}: \"A problem occurred processing '{dataset_name}/metadata.log' from {source_tarball!s}: 'filename {dataset_name}/metadata.log not found'\""
with monkeypatch.context() as m:
m.setattr(tarfile.TarFile, "extractfile", fake_extract_file)
cm.create(source_tarball)
assert cm[md5].metadata is None
with pytest.raises(MetadataError) as exc:
cm.create(source_tarball)
assert str(exc.value) == expected_msg

def test_with_metadata(
self, monkeypatch, selinux_disabled, server_config, make_logger, tarball
Expand Down Expand Up @@ -367,6 +370,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path:
f11.txt
f12_sym -> ../../..
f1.json
metadata.log
Generated cache map
Expand All @@ -376,6 +380,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path:
'details': <cache_manager.FileInfo object>,
'children': {
'f1.json': {'details': <cache_manager.FileInfo object>},
'metadata.log': {'details': <cache_manager.FileInfo object>},
'subdir1': {
'details': <cache_manager.FileInfo object>,
'children': {
Expand Down Expand Up @@ -412,6 +417,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path:
sub_dir = tmp_path / dir_name
sub_dir.mkdir(parents=True, exist_ok=True)
(sub_dir / "f1.json").touch()
(sub_dir / "metadata.log").touch()
for i in range(1, 4):
(sub_dir / "subdir1" / f"subdir1{i}").mkdir(parents=True, exist_ok=True)
(sub_dir / "subdir1" / "f11.txt").touch()
Expand Down Expand Up @@ -891,6 +897,120 @@ def test_cache_map_get_info_cmap(
file_info = tb.get_info(Path(file_path))
assert file_info == expected_msg

@pytest.mark.parametrize(
    "file_path, expected_info, expected_msg",
    [
        # Tarball root: a directory entry — info only, no content stream.
        (
            "",
            {
                "directories": ["subdir1"],
                "files": ["f1.json", "metadata.log"],
                "location": Path("dir_name"),
                "name": "dir_name",
                "resolve_path": None,
                "resolve_type": None,
                "size": None,
                "type": CacheType.DIRECTORY,
            },
            None,
        ),
        # Ordinary regular file: extract returns the (mocked) byte stream.
        (
            "f1.json",
            {
                "location": Path("dir_name/f1.json"),
                "name": "f1.json",
                "resolve_path": None,
                "resolve_type": None,
                "size": 0,
                "type": CacheType.FILE,
            },
            "file_as_a_byte_stream",
        ),
        # metadata.log is special-cased by extract: it returns the
        # (mocked) decoded contents rather than a stream.
        (
            "metadata.log",
            {
                "location": Path("dir_name/metadata.log"),
                "name": "metadata.log",
                "resolve_path": None,
                "resolve_type": None,
                "size": 0,
                "type": CacheType.FILE,
            },
            "file_contents",
        ),
        # Empty subdirectory: again info only, no content.
        (
            "subdir1/subdir12",
            {
                "directories": [],
                "files": [],
                "location": Path("dir_name/subdir1/subdir12"),
                "name": "subdir12",
                "resolve_path": None,
                "resolve_type": None,
                "size": None,
                "type": CacheType.DIRECTORY,
            },
            None,
        ),
    ],
)
def test_extract(
    self, monkeypatch, tmp_path, file_path, expected_info, expected_msg
):
    """Verify Tarball.extract returns (info, contents/stream) per entry type.

    Builds a fake on-disk result tree, maps it into the Tarball cache map,
    and checks that extract() yields the expected info dict plus either a
    stream (regular file), decoded contents (metadata.log), or None
    (directories). The actual tarfile I/O is stubbed out.
    """
    tar = Path("/mock/dir_name.tar.xz")
    cache = Path("/mock/.cache")

    # Stand-ins for the real static helpers; extract() should route
    # regular files to filestream and metadata.log to filecontents,
    # so the sentinel strings identify which path was taken.
    def mock_filestream(tarball_path, path):
        return "file_as_a_byte_stream"

    def mock_filecontents(tarball_path, path):
        return "file_contents"

    with monkeypatch.context() as m:
        # Replace constructors with lightweight mocks (no real archive
        # needed) and stub the tarball-reading helpers.
        m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__)
        m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__)
        m.setattr(Tarball, "filestream", mock_filestream)
        m.setattr(Tarball, "filecontents", mock_filecontents)
        tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None))
        # Materialize the synthetic result tree and build the cache map
        # that extract() consults for entry info.
        tar_dir = TestCacheManager.MockController.generate_test_result_tree(
            tmp_path, "dir_name"
        )
        tb.cache_map(tar_dir)
        info, content = tb.extract(file_path)
        assert info == expected_info
        assert content == expected_msg

def test_extract_tarfile_open(self, monkeypatch, tmp_path):
    """Verify extract() wraps tarfile failures in TarballUnpackError.

    Forces tarfile.open to raise, then checks that extracting either a
    nested file or metadata.log surfaces a TarballUnpackError whose
    message names the tarball and the (dataset-name-prefixed) member path.
    """
    tar = Path("/mock/dir_name.tar.xz")
    cache = Path("/mock/.cache")

    # Any attempt to open the (nonexistent) tarball fails; extract()
    # must translate this into TarballUnpackError.
    def fake_tarfile_open(self, path):
        raise tarfile.TarError("Invalid Tarfile")

    with monkeypatch.context() as m:
        m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__)
        m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__)
        m.setattr(tarfile, "open", fake_tarfile_open)
        tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None))
        # A real on-disk tree backs the cache map so get_info() succeeds
        # and extract() proceeds far enough to attempt the tarfile open.
        tar_dir = TestCacheManager.MockController.generate_test_result_tree(
            tmp_path, "dir_name"
        )
        tb.cache_map(tar_dir)

        # Regular-file path (filestream route).
        path = Path(tb.name) / "subdir1/f11.txt"
        expected_error_msg = f"An error occurred while unpacking {tb.tarball_path}: Unable to extract {path!r}"
        with pytest.raises(TarballUnpackError) as exc:
            info, content = tb.extract("subdir1/f11.txt")
        assert str(exc.value) == expected_error_msg

        # metadata.log path (filecontents route) must fail identically.
        path = Path(tb.name) / "metadata.log"
        expected_error_msg = f"An error occurred while unpacking {tb.tarball_path}: Unable to extract {path!r}"
        with pytest.raises(TarballUnpackError) as exc:
            info, content = tb.extract("metadata.log")
        assert str(exc.value) == expected_error_msg

def test_find(
self, selinux_enabled, server_config, make_logger, tarball, monkeypatch
):
Expand Down
Loading

0 comments on commit 177c1c6

Please sign in to comment.