PBENCH-1127 Implementation of Quisby API (#3463)

Integrated the `pquisby` package into the Pbench Server. This first pass of the API implementation is used to retrieve Quisby data for single-dataset visualization. Currently, `pquisby` supports only the `uperf` benchmark; support for other benchmarks will be added later.

We fetch the benchmark type from `dataset.metalog.pbench.script`. Unfortunately, this can't recognize a `pbench-user-benchmark` run with a `uperf` command. Solving this issue will require migrating Agent postprocessing into the server, because `pbench-user-benchmark` won't create the `result.csv` file.

`GET /api/v1/datasets/{dataset}/visualize`

Co-authored-by: siddardh <sira@redhat27!>
distributed-system-analysis · Jun 21, 2023 · 68d543d · 68d543d
1 parent d8e6b81
commit 68d543d
Show file tree

Hide file tree

Showing 9 changed files with 245 additions and 13 deletions.
diff --git a/lib/pbench/client/__init__.py b/lib/pbench/client/__init__.py
@@ -47,6 +47,7 @@ class API(Enum):
     DATASETS_NAMESPACE = "datasets_namespace"
     DATASETS_SEARCH = "datasets_search"
     DATASETS_VALUES = "datasets_values"
+    DATASETS_VISUALIZE = "datasets_visualize"
     ENDPOINTS = "endpoints"
     KEY = "key"
     RELAY = "relay"

diff --git a/lib/pbench/server/api/__init__.py b/lib/pbench/server/api/__init__.py
@@ -17,6 +17,7 @@
 from pbench.server.api.resources.datasets_inventory import DatasetsInventory
 from pbench.server.api.resources.datasets_list import DatasetsList
 from pbench.server.api.resources.datasets_metadata import DatasetsMetadata
+from pbench.server.api.resources.datasets_visualize import DatasetsVisualize
 from pbench.server.api.resources.endpoint_configure import EndpointConfig
 from pbench.server.api.resources.query_apis.dataset import Datasets
 from pbench.server.api.resources.query_apis.datasets.datasets_contents import (
@@ -119,6 +120,12 @@ def register_endpoints(api: Api, app: Flask, config: PbenchServerConfig):
         endpoint="datasets_search",
         resource_class_args=(config,),
     )
+    api.add_resource(
+        DatasetsVisualize,
+        f"{base_uri}/datasets/<string:dataset>/visualize",
+        endpoint="datasets_visualize",
+        resource_class_args=(config,),
+    )
     api.add_resource(
         EndpointConfig,
         f"{base_uri}/endpoints",

diff --git a/lib/pbench/server/api/resources/datasets_visualize.py b/lib/pbench/server/api/resources/datasets_visualize.py
@@ -0,0 +1,99 @@
+from http import HTTPStatus
+from urllib.request import Request
+
+from flask import current_app, jsonify
+from flask.wrappers import Response
+from pquisby.lib.post_processing import BenchmarkName, InputType, QuisbyProcessing
+
+from pbench.server import OperationCode, PbenchServerConfig
+from pbench.server.api.resources import (
+    APIAbort,
+    ApiAuthorizationType,
+    ApiBase,
+    ApiContext,
+    APIInternalError,
+    ApiMethod,
+    ApiParams,
+    ApiSchema,
+    Parameter,
+    ParamType,
+    Schema,
+)
+from pbench.server.cache_manager import (
+    CacheManager,
+    TarballNotFound,
+    TarballUnpackError,
+)
+from pbench.server.database import Dataset
+
+
+class DatasetsVisualize(ApiBase):
+    """
+    This class implements the Server API used to retrieve data for visualization.
+    """
+
+    def __init__(self, config: PbenchServerConfig):
+        super().__init__(
+            config,
+            ApiSchema(
+                ApiMethod.GET,
+                OperationCode.READ,
+                uri_schema=Schema(
+                    Parameter("dataset", ParamType.DATASET, required=True),
+                ),
+                authorization=ApiAuthorizationType.DATASET,
+            ),
+        )
+
+    def _get(
+        self, params: ApiParams, request: Request, context: ApiContext
+    ) -> Response:
+        """
+        This function is using Quisby to process results into a form that supports visualization
+
+        Args:
+            params: includes the uri parameters, which provide the dataset.
+            request: Original incoming Request object
+            context: API context dictionary
+
+        Raises:
+            APIAbort, reporting "NOT_FOUND", "UNSUPPORTED_MEDIA_TYPE", and "INTERNAL_SERVER_ERROR"
+
+        GET /api/v1/datasets/{dataset}/visualize
+        """
+
+        dataset = params.uri["dataset"]
+        cache_m = CacheManager(self.config, current_app.logger)
+
+        try:
+            tarball = cache_m.find_dataset(dataset.resource_id)
+        except TarballNotFound as e:
+            raise APIAbort(
+                HTTPStatus.NOT_FOUND, f"No dataset with ID '{e.tarball}' found"
+            ) from e
+
+        metadata = self._get_dataset_metadata(
+            dataset, ["dataset.metalog.pbench.script"]
+        )
+        benchmark = metadata["dataset.metalog.pbench.script"].upper()
+        benchmark_type = BenchmarkName.__members__.get(benchmark)
+        if not benchmark_type:
+            raise APIAbort(
+                HTTPStatus.UNSUPPORTED_MEDIA_TYPE, f"Unsupported Benchmark: {benchmark}"
+            )
+
+        name = Dataset.stem(tarball.tarball_path)
+        try:
+            file = tarball.extract(tarball.tarball_path, f"{name}/result.csv")
+        except TarballUnpackError as e:
+            raise APIInternalError(str(e)) from e
+
+        get_quisby_data = QuisbyProcessing().extract_data(
+            benchmark_type, dataset.name, InputType.STREAM, file
+        )
+
+        if get_quisby_data["status"] != "success":
+            raise APIInternalError(
+                f"Quisby processing failure. Exception: {get_quisby_data['exception']}"
+            )
+        return jsonify(get_quisby_data)
diff --git a/lib/pbench/server/cache_manager.py b/lib/pbench/server/cache_manager.py
@@ -49,7 +49,7 @@ def __init__(self, tarball: str):
         self.tarball = tarball
 
     def __str__(self) -> str:
-        return f"The dataset tarball named {self.tarball!r} is not present in the cache manager"
+        return f"The dataset tarball named {self.tarball!r} is not found"
 
 
 class DuplicateTarball(CacheManagerError):
@@ -59,7 +59,7 @@ def __init__(self, tarball: str):
         self.tarball = tarball
 
     def __str__(self) -> str:
-        return f"A dataset tarball named {self.tarball!r} is already present in the cache manager"
+        return f"A dataset tarball named {self.tarball!r} is already present"
 
 
 class MetadataError(CacheManagerError):

diff --git a/lib/pbench/test/unit/server/test_cache_manager.py b/lib/pbench/test/unit/server/test_cache_manager.py
@@ -236,7 +236,7 @@ def test_create_bad(
             cm.create(tarball.tarball_path)
         assert (
             str(exc.value)
-            == "A dataset tarball named 'pbench-user-benchmark_some + config_2021.05.01T12.42.42' is already present in the cache manager"
+            == "A dataset tarball named 'pbench-user-benchmark_some + config_2021.05.01T12.42.42' is already present"
         )
         assert tarball.metadata == fake_get_metadata(tarball.tarball_path)
         assert exc.value.tarball == tarball.name
@@ -924,10 +924,7 @@ def mock_run(args, **kwargs):
         assert tarball == cm[md5]
         with pytest.raises(TarballNotFound) as exc:
             cm["foobar"]
-        assert (
-            str(exc.value)
-            == "The dataset tarball named 'foobar' is not present in the cache manager"
-        )
+        assert str(exc.value) == "The dataset tarball named 'foobar' is not found"
 
         # Test __contains__
         assert md5 in cm
@@ -946,10 +943,7 @@ def mock_run(args, **kwargs):
         # Try to find a dataset that doesn't exist
         with pytest.raises(TarballNotFound) as exc:
             cm.find_dataset("foobar")
-        assert (
-            str(exc.value)
-            == "The dataset tarball named 'foobar' is not present in the cache manager"
-        )
+        assert str(exc.value) == "The dataset tarball named 'foobar' is not found"
         assert exc.value.tarball == "foobar"
 
         # Unpack the dataset, creating INCOMING and RESULTS links

diff --git a/lib/pbench/test/unit/server/test_datasets_inventory.py b/lib/pbench/test/unit/server/test_datasets_inventory.py
@@ -59,7 +59,7 @@ def test_get_no_dataset(self, query_get_as):
     def test_dataset_not_present(self, query_get_as):
         response = query_get_as("fio_2", "metadata.log", HTTPStatus.NOT_FOUND)
         assert response.json == {
-            "message": "The dataset tarball named 'random_md5_string4' is not present in the cache manager"
+            "message": "The dataset tarball named 'random_md5_string4' is not found"
         }
 
     def test_unauthorized_access(self, query_get_as):

diff --git a/lib/pbench/test/unit/server/test_datasets_visualize.py b/lib/pbench/test/unit/server/test_datasets_visualize.py
@@ -0,0 +1,127 @@
+from http import HTTPStatus
+from pathlib import Path
+
+from pquisby.lib.post_processing import QuisbyProcessing
+import pytest
+import requests
+
+from pbench.server import JSON
+from pbench.server.api.resources import ApiBase
+from pbench.server.cache_manager import CacheManager, Tarball
+from pbench.server.database.models.datasets import Dataset, DatasetNotFound
+
+
+class TestVisualize:
+    @pytest.fixture()
+    def query_get_as(self, client, server_config, more_datasets, get_token_func):
+        """
+        Helper fixture to perform the API query and validate an expected
+        return status.
+
+        Args:
+            client: Flask test API client fixture
+            server_config: Pbench config fixture
+            more_datasets: Dataset construction fixture
+            get_token_func: Pbench token fixture
+        """
+
+        def query_api(
+            dataset: str, user, expected_status: HTTPStatus
+        ) -> requests.Response:
+            try:
+                dataset_id = Dataset.query(name=dataset).resource_id
+            except DatasetNotFound:
+                dataset_id = dataset  # Allow passing deliberately bad value
+            headers = {"authorization": f"bearer {get_token_func(user)}"}
+            response = client.get(
+                f"{server_config.rest_uri}/datasets/{dataset_id}/visualize",
+                headers=headers,
+            )
+            assert response.status_code == expected_status
+            return response
+
+        return query_api
+
+    def mock_find_dataset(self, _dataset: str) -> Tarball:
+        class Tarball(object):
+            tarball_path = Path("/dataset/tarball.tar.xz")
+
+            def extract(_tarball_path: Path, _path: str) -> str:
+                return "CSV_file_as_a_byte_stream"
+
+        return Tarball
+
+    def mock_get_dataset_metadata(self, _dataset, _key) -> JSON:
+        return {"dataset.metalog.pbench.script": "uperf"}
+
+    def test_get_no_dataset(self, query_get_as):
+        response = query_get_as("nonexistent-dataset", "drb", HTTPStatus.NOT_FOUND)
+        assert response.json == {"message": "Dataset 'nonexistent-dataset' not found"}
+
+    def test_dataset_not_present(self, query_get_as):
+        response = query_get_as("fio_2", "drb", HTTPStatus.NOT_FOUND)
+        assert response.json == {
+            "message": "No dataset with ID 'random_md5_string4' found"
+        }
+
+    def test_unauthorized_access(self, query_get_as):
+        response = query_get_as("test", "drb", HTTPStatus.FORBIDDEN)
+        assert response.json == {
+            "message": "User drb is not authorized to READ a resource owned by test with private access"
+        }
+
+    def test_successful_get(self, query_get_as, monkeypatch):
+        def mock_extract_data(self, test_name, dataset_name, input_type, data) -> JSON:
+            return {"status": "success", "json_data": "quisby_data"}
+
+        monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset)
+        monkeypatch.setattr(
+            ApiBase, "_get_dataset_metadata", self.mock_get_dataset_metadata
+        )
+        monkeypatch.setattr(QuisbyProcessing, "extract_data", mock_extract_data)
+
+        response = query_get_as("uperf_1", "test", HTTPStatus.OK)
+        assert response.json["status"] == "success"
+        assert response.json["json_data"] == "quisby_data"
+
+    def test_unsuccessful_get_with_incorrect_data(self, query_get_as, monkeypatch):
+        def mock_find_dataset_with_incorrect_data(self, dataset) -> Tarball:
+            class Tarball(object):
+                tarball_path = Path("/dataset/tarball.tar.xz")
+
+                def extract(tarball_path, path) -> str:
+                    return "IncorrectData"
+
+            return Tarball
+
+        def mock_extract_data(self, test_name, dataset_name, input_type, data) -> JSON:
+            return {"status": "failed", "exception": "Unsupported Media Type"}
+
+        monkeypatch.setattr(
+            CacheManager, "find_dataset", mock_find_dataset_with_incorrect_data
+        )
+        monkeypatch.setattr(
+            ApiBase, "_get_dataset_metadata", self.mock_get_dataset_metadata
+        )
+        monkeypatch.setattr(QuisbyProcessing, "extract_data", mock_extract_data)
+        response = query_get_as("uperf_1", "test", HTTPStatus.INTERNAL_SERVER_ERROR)
+        assert response.json["message"].startswith(
+            "Internal Pbench Server Error: log reference "
+        )
+
+    def test_unsupported_benchmark(self, query_get_as, monkeypatch):
+        flag = True
+
+        def mock_extract_data(*args, **kwargs) -> JSON:
+            nonlocal flag
+            flag = False
+
+        def mock_get_metadata(self, dataset, key) -> JSON:
+            return {"dataset.metalog.pbench.script": "hammerDB"}
+
+        monkeypatch.setattr(CacheManager, "find_dataset", self.mock_find_dataset)
+        monkeypatch.setattr(ApiBase, "_get_dataset_metadata", mock_get_metadata)
+        monkeypatch.setattr(QuisbyProcessing, "extract_data", mock_extract_data)
+        response = query_get_as("uperf_1", "test", HTTPStatus.UNSUPPORTED_MEDIA_TYPE)
+        assert response.json["message"] == "Unsupported Benchmark: HAMMERDB"
+        assert flag is True
diff --git a/lib/pbench/test/unit/server/test_endpoint_configure.py b/lib/pbench/test/unit/server/test_endpoint_configure.py
@@ -101,6 +101,10 @@ def check_config(self, client, server_config, host, my_headers={}):
                         "dataset_view": {"type": "string"},
                     },
                 },
+                "datasets_visualize": {
+                    "template": f"{uri}/datasets/{{dataset}}/visualize",
+                    "params": {"dataset": {"type": "string"}},
+                },
                 "endpoints": {"template": f"{uri}/endpoints", "params": {}},
                 "key": {
                     "template": f"{uri}/key/{{key}}",

diff --git a/server/requirements.txt b/server/requirements.txt
@@ -20,4 +20,4 @@ python-dateutil
 requests # TODO CVE-2023-32681 (>=2.31.0)
 sdnotify
 sqlalchemy>=1.4.23
-sqlalchemy_utils>=0.37.6
+sqlalchemy_utils>=0.37.6