Expose aggregate metadata namespace for UI (#3345)
* Expose aggregate metadata namespace for UI

PBENCH-1091

Managing the display of metadata in the dashboard is complicated by the fact
that much of the namespace is highly dynamic: the `global` and `user` spaces
are completely defined by clients, and can be completely different for each
dataset, while the `dataset.metalog` namespace depends on the agent benchmark
parameters and postprocessing.

In order to provide a reference point, this PR proposes a way to acquire a
JSON document representing the aggregated namespace across a set of selected
datasets. It leverages the existing `GET /datasets` collection query for this
purpose, including all the filter keywords; however, instead of returning a
list of matching datasets, it builds a JSON document showing the nested key
namespace.

A partial example follows:

```
GET /api/v1/datasets?keysummary=true
-----
{
    "dataset": {
        "access": null,
        "id": null,
        "metalog": {
            "iterations/0__linpack-binary=:root:linpack:xlinpack_xeon64": {
                "clients": null,
                "iteration_name": null,
                "iteration_number": null,
                "linpack-binary": null
            },
            [...]
        }
    }
}
```
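
For context, a dashboard client might consume this aggregation by flattening the nested namespace into dotted key paths (for example, to populate a metadata filter picker). Below is a minimal sketch assuming a Python client using `requests`; the server host and bearer token are illustrative placeholders, not part of this change:

```python
# Hypothetical client-side sketch: fetch the aggregated key namespace and
# flatten it into dotted key paths. Host and token are placeholders.
import requests

def flatten(node: dict, prefix: str = "") -> list[str]:
    """Leaves (value None) become dotted paths like 'dataset.metalog.pbench.script'."""
    paths = []
    for key, value in node.items():
        path = f"{prefix}.{key}" if prefix else key
        if isinstance(value, dict):
            paths.extend(flatten(value, path))
        else:
            paths.append(path)
    return paths

response = requests.get(
    "https://pbench.example.com/api/v1/datasets",
    params={"keysummary": "true"},
    headers={"Authorization": "Bearer <token>"},
)
response.raise_for_status()
for key_path in sorted(flatten(response.json())):
    print(key_path)
```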
dbutenhof authored Mar 17, 2023
1 parent ab2ff68 commit 1f5cd39
Showing 2 changed files with 208 additions and 36 deletions.
153 changes: 117 additions & 36 deletions lib/pbench/server/api/resources/datasets_list.py
@@ -1,4 +1,5 @@
from http import HTTPStatus
from typing import Any
from urllib.parse import urlencode, urlparse

from flask import current_app
@@ -50,14 +51,24 @@ def __init__(self, config: PbenchServerConfig):
                ApiMethod.GET,
                OperationCode.READ,
                query_schema=Schema(
                    # Filter criteria
                    Parameter("mine", ParamType.BOOLEAN),
                    Parameter("name", ParamType.STRING),
                    Parameter("owner", ParamType.USER),
                    Parameter("access", ParamType.ACCESS),
                    Parameter("start", ParamType.DATE),
                    Parameter("end", ParamType.DATE),
                    Parameter(
                        "filter",
                        ParamType.LIST,
                        element_type=ParamType.STRING,
                        string_list=",",
                    ),
                    # Pagination
                    Parameter("offset", ParamType.INT),
                    Parameter("limit", ParamType.INT),
                    # Output control
                    Parameter("keysummary", ParamType.BOOLEAN),
                    Parameter(
                        "metadata",
                        ParamType.LIST,
@@ -66,12 +77,6 @@ def __init__(self, config: PbenchServerConfig):
                        key_path=True,
                        string_list=",",
                    ),
                    Parameter(
                        "filter",
                        ParamType.LIST,
                        element_type=ParamType.STRING,
                        string_list=",",
                    ),
                ),
                authorization=ApiAuthorizationType.USER_ACCESS,
            ),
@@ -289,6 +294,108 @@ def filter_query(filters: list[str], query: Query) -> Query:

        return query.filter(and_(*and_list))

    def accumulate(self, aggregate: JSONOBJECT, key: str, value: Any):
        """Recursive helper to accumulate the metadata namespace

        Iterate through a list of metadata key/value pairs to construct a
        hierarchical aggregation of all metadata keys across the selected
        datasets. Each key in the hierarchy is represented as a key in a
        nested JSON object. "Leaf" keys have the value None. E.g.,

        {
            "dataset": {"name": None, "metalog": {"pbench": {"script": None}}},
            "server": {"deletion": None, "tarball-path": None},
            "global": {"server": {"legacy": {"sha1": None}}}
        }

        Args:
            aggregate: a JSONOBJECT to update with the recursive key/value
            key: the current metadata key path element
            value: the current metadata key's value
        """
        if isinstance(value, dict):
            p = aggregate.get(key)
            if p is None:
                p = {}
                aggregate[key] = p
            for k, v in value.items():
                self.accumulate(p, k, v)
        elif key not in aggregate:
            aggregate[key] = None

    def keyspace(self, query: Query) -> JSONOBJECT:
        """Aggregate the dataset metadata keyspace

        Run the query we've compiled, but instead of returning Dataset proxies,
        we only want the metadata key/value pairs we've selected.

        NOTE: The SQL left outer join returns a row for each row in the "left"
        table (Dataset) even if there is no matching foreign key in the "right"
        table (Metadata). This means a dataset with no metadata will result in
        a join row here with key and value of None. The `elif` in the loop will
        silently ignore rows with a null key to handle this case.

        Args:
            query: The basic filtered SQLAlchemy query object

        Returns:
            The aggregated keyspace JSON object
        """
        aggregate: JSONOBJECT = {
            "dataset": {c.name: None for c in Dataset.__table__._columns}
        }
        list = query.with_entities(Metadata.key, Metadata.value).all()
        for k, v in list:
            # "metalog" is a top-level key in the Metadata schema, but we
            # report it as a sub-key of "dataset".
            if k == Metadata.METALOG:
                self.accumulate(aggregate["dataset"], k, v)
            elif k:
                self.accumulate(aggregate, k, v)
        return aggregate

    def datasets(self, request: Request, json: JSONOBJECT, query: Query) -> JSONOBJECT:
        """Gather and paginate the selected datasets

        Run the query we've compiled, with pagination limits applied; collect
        results into a list of JSON objects including selected metadata keys.

        Args:
            request: The HTTP Request object
            json: The JSON query parameters
            query: The basic filtered SQLAlchemy query object

        Returns:
            The paginated dataset listing
        """
        try:
            datasets, paginated_result = self.get_paginated_obj(
                query=query, json=json, url=request.url
            )
        except (AttributeError, ProgrammingError, StatementError) as e:
            raise APIInternalError(
                f"Constructed SQL for {json} isn't executable"
            ) from e
        except Exception as e:
            raise APIInternalError(f"Unexpected SQL exception: {e}") from e

        keys = json.get("metadata")

        response = []
        for dataset in datasets:
            d = {
                "name": dataset.name,
                "resource_id": dataset.resource_id,
            }
            try:
                d["metadata"] = self._get_dataset_metadata(dataset, keys)
            except MetadataError:
                d["metadata"] = None
            response.append(d)

        paginated_result["results"] = response
        return paginated_result

    def _get(
        self, params: ApiParams, request: Request, context: ApiContext
    ) -> Response:
@@ -346,33 +453,7 @@ def _get(
        else:
            owner = json.get("owner")
            query = self._build_sql_query(owner, json.get("access"), query)

        try:
            datasets, paginated_result = self.get_paginated_obj(
                query=query, json=json, url=request.url
            )
        except (AttributeError, ProgrammingError, StatementError) as e:
            raise APIInternalError(
                f"Constructed SQL for {json} isn't executable"
            ) from e
        except Exception as e:
            raise APIInternalError(f"Unexpected SQL exception: {e}") from e

        keys = json.get("metadata")

        response = []
        for dataset in datasets:
            d = {
                "name": dataset.name,
                "resource_id": dataset.resource_id,
            }
            try:
                d["metadata"] = self._get_dataset_metadata(dataset, keys)
            except MetadataError as e:
                current_app.logger.warning(
                    "Error getting metadata {} for dataset {}: {}", keys, dataset, e
                )
            response.append(d)

        paginated_result["results"] = response
        return jsonify(paginated_result)
        if json.get("keysummary"):
            return jsonify(self.keyspace(query))
        else:
            return jsonify(self.datasets(request, json, query))
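
The merge behavior of `accumulate` is easiest to see in isolation. Here is a minimal standalone sketch, reusing the same method body as the diff above but driven by hand-written rows that mirror the `global.legacy` / `global.legacy.server` conflict exercised in the test below; the sample rows are illustrative, not actual query output:

```python
# Standalone sketch of the accumulate() merge semantics: when a scalar key
# ("global.legacy") collides with a nested namespace ("global.legacy.server"),
# the dict form wins, and later scalars never clobber an existing entry.
from typing import Any

def accumulate(aggregate: dict, key: str, value: Any) -> None:
    if isinstance(value, dict):
        p = aggregate.get(key)
        if p is None:
            p = {}
            aggregate[key] = p
        for k, v in value.items():
            accumulate(p, k, v)
    elif key not in aggregate:
        aggregate[key] = None

aggregate: dict = {}
rows = [
    ("global", {"legacy": "Truish"}),           # leaf value arrives first...
    ("global", {"legacy": {"server": "ABC"}}),  # ...then a conflicting namespace
    ("server", {"origin": "SAT"}),
]
for key, value in rows:
    accumulate(aggregate, key, value)

# {'global': {'legacy': {'server': None}}, 'server': {'origin': None}}
print(aggregate)
```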
91 changes: 91 additions & 0 deletions lib/pbench/test/unit/server/test_datasets_list.py
@@ -345,6 +345,43 @@ def test_get_bad_keys(self, query_as):
            )
        }

    def test_get_get_errors(self, server_config, query_as):
        """Test case reporting key errors

        Args:
            query_as: Query helper fixture
        """
        fio_1 = Dataset.query(name="fio_1")
        fio_2 = Dataset.query(name="fio_2")
        Metadata.setvalue(dataset=fio_1, key="global.test", value="ABC")
        Metadata.setvalue(dataset=fio_2, key="global.test.foo", value="ABC")
        response = query_as(
            {"metadata": "global.test.foo"},
            "drb",
            HTTPStatus.OK,
        )
        assert response.json == {
            "next_url": "",
            "results": [
                {
                    "metadata": {"global.test.foo": None},
                    "name": "drb",
                    "resource_id": "random_md5_string1",
                },
                {
                    "metadata": {"global.test.foo": None},
                    "name": "fio_1",
                    "resource_id": "random_md5_string3",
                },
                {
                    "metadata": {"global.test.foo": "ABC"},
                    "name": "fio_2",
                    "resource_id": "random_md5_string4",
                },
            ],
            "total": 3,
        }

    def test_get_unknown_keys(self, query_as):
        """Test case requesting non-existent query parameter keys.
@@ -496,3 +533,57 @@ def do_error(
                if key in m:
                    assert error in m
                    break

    def test_key_summary(self, query_as):
        """Test keyspace summary.

        With the `keysummary` query parameter, /datasets returns an aggregation
        of defined metadata key namespaces for the selected datasets.

        We add a few metadata keys to the ones provided by the fixture to show
        aggregation across multiple selected datasets. Note that without filter
        criteria, the query here should return drb's "drb" and "fio_1" datasets
        and test's public "fio_2" dataset.
        """
        drb = Dataset.query(name="drb")
        fio_1 = Dataset.query(name="fio_1")

        # Make sure we aggregate distinct namespaces across the three visible
        # datasets by setting some varied keys. We leave fio_2 "pristine" to
        # prove that the aggregator doesn't fail when we find no metadata for
        # a dataset. We deliberately create the conflicting "global.legacy"
        # and "global.legacy.server" to show that the conflict doesn't cause
        # a problem.
        Metadata.setvalue(dataset=drb, key="global.legacy", value="Truish")
        Metadata.setvalue(dataset=fio_1, key="server.origin", value="SAT")
        Metadata.setvalue(dataset=fio_1, key="global.legacy.server", value="ABC")
        response = query_as({"keysummary": "true"}, "drb", HTTPStatus.OK)
        assert response.json == {
            "dataset": {
                "access": None,
                "id": None,
                "metalog": {
                    "pbench": {
                        "config": None,
                        "date": None,
                        "name": None,
                        "script": None,
                    },
                    "run": {"controller": None},
                },
                "name": None,
                "owner_id": None,
                "resource_id": None,
                "uploaded": None,
            },
            "global": {"contact": None, "legacy": {"server": None}},
            "server": {
                "deletion": None,
                "index-map": {
                    "unit-test.v5.result-data-sample.2020-08": None,
                    "unit-test.v6.run-data.2020-08": None,
                    "unit-test.v6.run-toc.2020-05": None,
                },
                "origin": None,
            },
        }
