Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PBENCH-1014 Using Tarball.extract in Inventory API for extracting files from tarball #3105

Merged
merged 10 commits into from
Jun 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 11 additions & 22 deletions lib/pbench/server/api/resources/datasets_inventory.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
ParamType,
Schema,
)
from pbench.server.cache_manager import CacheManager, TarballNotFound
from pbench.server.cache_manager import CacheManager, CacheType, TarballNotFound


class DatasetsInventory(ApiBase):
Expand Down Expand Up @@ -54,27 +54,18 @@ def _get(
APIAbort, reporting either "NOT_FOUND" or "UNSUPPORTED_MEDIA_TYPE"


GET /api/v1/datasets/inventory/{dataset}/{target}
GET /api/v1/datasets/{dataset}/inventory/{target}
"""

dataset = params.uri["dataset"]
target = params.uri.get("target")

cache_m = CacheManager(self.config, current_app.logger)
try:
tarball = cache_m.find_dataset(dataset.resource_id)
file_info = cache_m.filestream(dataset, target)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having a function named "filestream" return a "file information" dictionary doesn't seem like the best interface. Maybe something somewhat more generic, like "get_file()", would be better?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, reality hasn't kept up with the implementation, and at some point we should consider whipping it into shape.

except TarballNotFound as e:
raise APIAbort(HTTPStatus.NOT_FOUND, str(e))

if target is None:
file_path = tarball.tarball_path
else:
dataset_location = tarball.unpacked
if dataset_location is None:
raise APIAbort(HTTPStatus.NOT_FOUND, "The dataset is not unpacked")
file_path = dataset_location / target

if file_path.is_file():
if file_info["type"] == CacheType.FILE:
dbutenhof marked this conversation as resolved.
Show resolved Hide resolved
# Tell send_file to set `Content-Disposition` to "attachment" if
# targeting the large binary tarball. Otherwise we'll recommend the
# default "inline": only `is_file()` paths are allowed here, and
Expand All @@ -84,17 +75,15 @@ def _get(
#
# NOTE: we could choose to be "smarter" based on file size, file
# type codes (e.g., .csv, .json), and so forth.
return send_file(
file_path,
resp = send_file(
file_info["stream"],
as_attachment=target is None,
download_name=file_path.name,
)
elif file_path.exists():
raise APIAbort(
HTTPStatus.UNSUPPORTED_MEDIA_TYPE,
"The specified path does not refer to a regular file",
Comment on lines -93 to -95
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am a bit behind on this one, but why are we removing UNSUPPORTED_MEDIA_TYPE and returning every error as 404 now?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Tarball.extract method was handling all errors around this, so I didn't see the need to include these lines.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cache manager raises an internal error, but we don't want that to fall through to the generic (internal error) except Exception in _dispatch. That's why we have APIAbort, which is treated specially. We want to make sure that the referenced file path exists in the tarball and is a regular file. I'm not sure exactly what tarfile.extractfile will return/raise in this case, but we need to specifically catch that case and raise APIAbort with UNSUPPORTED_MEDIA_TYPE.

download_name=file_info["name"],
)
file_info["stream"].close()
return resp
else:
raise APIAbort(
HTTPStatus.NOT_FOUND, "The specified path does not refer to a file"
HTTPStatus.BAD_REQUEST,
"The specified path does not refer to a regular file",
)
84 changes: 59 additions & 25 deletions lib/pbench/server/cache_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import shutil
import subprocess
import tarfile
from typing import Optional, Union
from typing import IO, Optional, Union

from pbench.common import MetadataLog, selinux
from pbench.server import JSONOBJECT, PbenchServerConfig
Expand Down Expand Up @@ -373,7 +373,6 @@ def traverse_cmap(path: Path, cachemap: dict) -> dict[str, dict]:
raise BadDirpath(
f"Found a file {file_l!r} where a directory was expected in path {str(path)!r}"
)

return f_entries[path.name]
except KeyError as exc:
raise BadDirpath(
Expand Down Expand Up @@ -430,29 +429,57 @@ def get_info(self, path: Path) -> JSONOBJECT:
return fd_info

@staticmethod
def extract(tarball_path: Path, path: Path) -> Optional[IO[bytes]]:
    """Return a binary stream yielding the contents of a file at the
    specified path inside the tarball.

    Args:
        tarball_path: absolute path of the tarball
        path: relative path within the tarball

    Raises:
        BadDirpath: on failure extracting the file from the tarball
            (bad tarball, missing member, etc.)

    Returns:
        An open binary stream for the named member, or None when the
        member exists but is not a regular file (tarfile.extractfile
        returns None for directories and special members).
    """
    try:
        # NOTE: the TarFile handle is intentionally not closed here:
        # the returned member stream reads lazily from it, so closing
        # the archive would invalidate the stream. The caller owns the
        # returned stream and is responsible for closing it.
        return tarfile.open(tarball_path, "r:*").extractfile(path)
    except Exception as exc:
        # Chain the original cause for easier debugging.
        raise BadDirpath(
            f"A problem occurred processing {str(path)!r} from {str(tarball_path)!r}: {exc}"
        ) from exc

Report failures by raising exceptions.
def filestream(self, path: str) -> Optional[dict]:
    """Return a dictionary describing the target file plus an open stream.

    When no path is given, the whole tarball is streamed; otherwise the
    named member is located via the cache map and, if it is a regular
    file, extracted from the tarball as a stream.

    Args:
        path: relative path within the tarball of a file; falsy ("" or
            None) selects the tarball itself

    Raises:
        TarballUnpackError: on failure to extract the named path from
            the tarball

    Returns:
        Dictionary with file info; the "stream" key holds an open binary
        stream for regular files and None for other member types. The
        caller is responsible for closing the stream.
    """
    if not path:
        # No target: stream the entire tarball as a single "file".
        info = {
            "name": self.name,
            "type": CacheType.FILE,
            "stream": self.tarball_path.open("rb"),
        }
    else:
        file_path = Path(self.name) / path
        info = self.get_info(file_path)
        if info["type"] == CacheType.FILE:
            try:
                info["stream"] = Tarball.extract(self.tarball_path, file_path)
            except Exception as exc:
                raise TarballUnpackError(
                    self.tarball_path, f"Unable to extract {str(file_path)!r}"
                ) from exc
        else:
            # Directories, symlinks, etc. have no content stream.
            info["stream"] = None

    return info

@staticmethod
def _get_metadata(tarball_path: Path) -> Optional[JSONOBJECT]:
Expand All @@ -465,15 +492,13 @@ def _get_metadata(tarball_path: Path) -> Optional[JSONOBJECT]:
name = Dataset.stem(tarball_path)
try:
data = Tarball.extract(tarball_path, f"{name}/metadata.log")
except TarballUnpackError:
data = None
if data:
metadata = MetadataLog()
metadata.read_string(data)
metadata = {s: dict(metadata.items(s)) for s in metadata.sections()}
return metadata
else:
except BadDirpath:
return None
else:
metadata_log = MetadataLog()
metadata_log.read_file(e.decode() for e in data)
metadata = {s: dict(metadata_log.items(s)) for s in metadata_log.sections()}
return metadata

@staticmethod
def subprocess_run(
Expand Down Expand Up @@ -946,6 +971,13 @@ def create(self, tarfile: Path) -> Tarball:
controller: associated controller name
tarfile: dataset tarball path

Raises
BadDirpath: Failure on extracting the file from tarball
MetadataError: Failure on getting metadata from metadata.log file
in the tarball
BadFilename: A bad tarball path was given
DuplicateTarball: A duplicate tarball name was detected

Returns
Tarball object
"""
Expand All @@ -955,8 +987,6 @@ def create(self, tarfile: Path) -> Tarball:
controller_name = metadata["run"]["controller"]
else:
controller_name = "unknown"
except MetadataError:
raise
except Exception as exc:
raise MetadataError(tarfile, exc)

Expand Down Expand Up @@ -1005,6 +1035,10 @@ def get_info(self, dataset_id: str, path: Path) -> dict:
tmap = tarball.get_info(path)
return tmap

def filestream(self, dataset, target):
    """Return file/stream information for a file within a dataset tarball.

    Args:
        dataset: a Dataset whose resource_id identifies the tarball
        target: relative path of the desired file inside the tarball

    Returns:
        Whatever the located tarball's filestream() yields for the
        target (a dict of file info plus an open stream).
    """
    # Delegate: resolve the dataset to its Tarball, then hand off.
    return self.find_dataset(dataset.resource_id).filestream(target)

def uncache(self, dataset_id: str):
"""Remove the unpacked tarball tree.

Expand Down
76 changes: 58 additions & 18 deletions lib/pbench/test/unit/server/test_cache_manager.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import hashlib
import io
from logging import Logger
import os
from pathlib import Path
Expand Down Expand Up @@ -132,12 +133,6 @@ def test_metadata(
):
"""Test behavior with metadata.log access errors."""

def fake_extract_file(self, path):
raise KeyError(f"filename {path} not found")

def fake_tarfile_open(self, path):
raise tarfile.TarError("Invalid Tarfile")

def fake_metadata(tar_path):
return {"pbench": {"date": "2002-05-16T00:00:00"}}

Expand All @@ -155,13 +150,6 @@ def fake_metadata_controller(tar_path):
source_tarball, source_md5, md5 = tarball
cm = CacheManager(server_config, make_logger)

expected_metaerror = f"A problem occurred processing metadata.log from {source_tarball!s}: 'Invalid Tarfile'"
with monkeypatch.context() as m:
m.setattr(tarfile, "open", fake_tarfile_open)
with pytest.raises(MetadataError) as exc:
cm.create(source_tarball)
assert str(exc.value) == expected_metaerror

expected_metaerror = f"A problem occurred processing metadata.log from {source_tarball!s}: \"'run'\""
with monkeypatch.context() as m:
m.setattr(Tarball, "_get_metadata", fake_metadata)
Expand All @@ -183,11 +171,6 @@ def fake_metadata_controller(tar_path):
cm.create(source_tarball)
assert str(exc.value) == expected_metaerror

with monkeypatch.context() as m:
m.setattr(tarfile.TarFile, "extractfile", fake_extract_file)
cm.create(source_tarball)
assert cm[md5].metadata is None

def test_with_metadata(
self, monkeypatch, selinux_disabled, server_config, make_logger, tarball
):
Expand Down Expand Up @@ -367,6 +350,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path:
f11.txt
f12_sym -> ../../..
f1.json
metadata.log


Generated cache map
Expand All @@ -376,6 +360,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path:
'details': <cache_manager.FileInfo object>,
'children': {
'f1.json': {'details': <cache_manager.FileInfo object>},
'metadata.log': {'details': <cache_manager.FileInfo object>},
'subdir1': {
'details': <cache_manager.FileInfo object>,
'children': {
Expand Down Expand Up @@ -412,6 +397,7 @@ def generate_test_result_tree(tmp_path: Path, dir_name: str) -> Path:
sub_dir = tmp_path / dir_name
sub_dir.mkdir(parents=True, exist_ok=True)
(sub_dir / "f1.json").touch()
(sub_dir / "metadata.log").touch()
for i in range(1, 4):
(sub_dir / "subdir1" / f"subdir1{i}").mkdir(parents=True, exist_ok=True)
(sub_dir / "subdir1" / "f11.txt").touch()
Expand Down Expand Up @@ -891,6 +877,60 @@ def test_cache_map_get_info_cmap(
file_info = tb.get_info(Path(file_path))
assert file_info == expected_msg

@pytest.mark.parametrize(
    "file_path, exp_file_type, exp_stream",
    [
        ("", CacheType.FILE, io.BytesIO(b"tarball_as_a_byte_stream")),
        (None, CacheType.FILE, io.BytesIO(b"tarball_as_a_byte_stream")),
        ("f1.json", CacheType.FILE, io.BytesIO(b"tarball_as_a_byte_stream")),
        ("subdir1/subdir12", CacheType.DIRECTORY, None),
    ],
)
def test_filestream(
    self, monkeypatch, tmp_path, file_path, exp_file_type, exp_stream
):
    """Test extraction of file contents/stream from a tarball member.

    A falsy path (empty string or None) streams the tarball itself;
    a regular-file path yields the extracted stream; a directory path
    yields no stream.
    """
    tar = Path("/mock/dir_name.tar.xz")
    cache = Path("/mock/.cache")

    with monkeypatch.context() as m:
        m.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__)
        m.setattr(Controller, "__init__", TestCacheManager.MockController.__init__)
        # Stub both extraction paths so no real tarball is needed.
        m.setattr(Tarball, "extract", lambda _t, _p: exp_stream)
        m.setattr(Path, "open", lambda _s, _m="rb": exp_stream)
        tb = Tarball(tar, Controller(Path("/mock/archive"), cache, None))
        tar_dir = TestCacheManager.MockController.generate_test_result_tree(
            tmp_path, "dir_name"
        )
        tb.cache_map(tar_dir)
        file_info = tb.filestream(file_path)
        assert file_info["type"] == exp_file_type
        assert file_info["stream"] == exp_stream

def test_filestream_tarfile_open(self, monkeypatch, tmp_path):
    """Verify that a failure opening the tarball surfaces as TarballUnpackError."""
    tarball_file = Path("/mock/dir_name.tar.xz")
    cache_root = Path("/mock/.cache")

    def broken_open(self, path):
        # Simulate tarfile.open blowing up on a corrupt archive.
        raise tarfile.TarError("Invalid Tarfile")

    with monkeypatch.context() as patcher:
        patcher.setattr(Tarball, "__init__", TestCacheManager.MockTarball.__init__)
        patcher.setattr(
            Controller, "__init__", TestCacheManager.MockController.__init__
        )
        patcher.setattr(tarfile, "open", broken_open)
        tb = Tarball(
            tarball_file, Controller(Path("/mock/archive"), cache_root, None)
        )
        tree = TestCacheManager.MockController.generate_test_result_tree(
            tmp_path, "dir_name"
        )
        tb.cache_map(tree)

        member = Path(tb.name) / "subdir1/f11.txt"
        expected = (
            f"An error occurred while unpacking {tb.tarball_path}: "
            f"Unable to extract {str(member)!r}"
        )
        with pytest.raises(TarballUnpackError) as exc:
            tb.filestream("subdir1/f11.txt")
        assert str(exc.value) == expected

def test_find(
self, selinux_enabled, server_config, make_logger, tarball, monkeypatch
):
Expand Down
Loading