Expose aggregate metadata namespace for UI (#3345)
* Expose aggregate metadata namespace for UI

PBENCH-1091

Managing the display of metadata in the dashboard is complicated by the fact
that much of the namespace is highly dynamic: the `global` and `user` spaces
are completely defined by clients, and can be completely different for each
dataset, while the `dataset.metalog` namespace depends on the agent benchmark
parameters and postprocessing.

In order to provide a reference point, this PR proposes a way to acquire a
JSON document representing the aggregated namespace across a set of selected
datasets. It leverages the existing `GET /datasets` collection query for this
purpose, including all the filter keywords; however, instead of returning a
list of matching datasets, it builds a JSON document showing the nested key
namespace.

A partial example follows:

```
GET /api/v1/datasets?keysummary=true
-----
{
    "dataset": {
        "access": null,
        "id": null,
        "metalog": {
            "iterations/0__linpack-binary=:root:linpack:xlinpack_xeon64": {
                "clients": null,
                "iteration_name": null,
                "iteration_number": null,
                "linpack-binary": null
            },
            [...]
        }
    }
}
```
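
For context, a dashboard client might consume this aggregation by flattening the nested namespace into dotted key paths (for example, to populate a metadata filter picker). Below is a minimal sketch assuming a Python client using `requests`; the server host and bearer token are illustrative placeholders, not part of this change:

```python
# Hypothetical client-side sketch: fetch the aggregated key namespace and
# flatten it into dotted key paths. Host and token are placeholders.
import requests

def flatten(node: dict, prefix: str = "") -> list[str]:
    """Leaves (value None) become dotted paths like 'dataset.metalog.pbench.script'."""
    paths = []
    for key, value in node.items():
        path = f"{prefix}.{key}" if prefix else key
        if isinstance(value, dict):
            paths.extend(flatten(value, path))
        else:
            paths.append(path)
    return paths

response = requests.get(
    "https://pbench.example.com/api/v1/datasets",
    params={"keysummary": "true"},
    headers={"Authorization": "Bearer <token>"},
)
response.raise_for_status()
for key_path in sorted(flatten(response.json())):
    print(key_path)
```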
dbutenhof authored Mar 17, 2023
1 parent ab2ff68 commit 1f5cd39
Showing 2 changed files with 208 additions and 36 deletions.
153 changes: 117 additions & 36 deletions lib/pbench/server/api/resources/datasets_list.py
@@ -1,4 +1,5 @@
from http import HTTPStatus
from typing import Any
from urllib.parse import urlencode, urlparse

from flask import current_app
@@ -50,14 +51,24 @@ def __init__(self, config: PbenchServerConfig):
                ApiMethod.GET,
                OperationCode.READ,
                query_schema=Schema(
                    # Filter criteria
                    Parameter("mine", ParamType.BOOLEAN),
                    Parameter("name", ParamType.STRING),
                    Parameter("owner", ParamType.USER),
                    Parameter("access", ParamType.ACCESS),
                    Parameter("start", ParamType.DATE),
                    Parameter("end", ParamType.DATE),
                    Parameter(
                        "filter",
                        ParamType.LIST,
                        element_type=ParamType.STRING,
                        string_list=",",
                    ),
                    # Pagination
                    Parameter("offset", ParamType.INT),
                    Parameter("limit", ParamType.INT),
                    # Output control
                    Parameter("keysummary", ParamType.BOOLEAN),
                    Parameter(
                        "metadata",
                        ParamType.LIST,
@@ -66,12 +77,6 @@ def __init__(self, config: PbenchServerConfig):
                        key_path=True,
                        string_list=",",
                    ),
                    Parameter(
                        "filter",
                        ParamType.LIST,
                        element_type=ParamType.STRING,
                        string_list=",",
                    ),
                ),
                authorization=ApiAuthorizationType.USER_ACCESS,
            ),
@@ -289,6 +294,108 @@ def filter_query(filters: list[str], query: Query) -> Query:

        return query.filter(and_(*and_list))

    def accumulate(self, aggregate: JSONOBJECT, key: str, value: Any):
        """Recursive helper to accumulate the metadata namespace

        Iterate through a list of metadata key/value pairs to construct a
        hierarchical aggregation of all metadata keys across the selected
        datasets. Each key in the hierarchy is represented as a key in a
        nested JSON object. "Leaf" keys have the value None. E.g.,

        {
            "dataset": {"name": None, "metalog": {"pbench": {"script": None}}},
            "server": {"deletion": None, "tarball-path": None},
            "global": {"server": {"legacy": {"sha1": None}}}
        }

        Args:
            aggregate: a JSONOBJECT to update with the recursive key/value
            key: the current metadata key path element
            value: the current metadata key's value
        """
        if isinstance(value, dict):
            p = aggregate.get(key)
            if p is None:
                p = {}
                aggregate[key] = p
            for k, v in value.items():
                self.accumulate(p, k, v)
        elif key not in aggregate:
            aggregate[key] = None

    def keyspace(self, query: Query) -> JSONOBJECT:
        """Aggregate the dataset metadata keyspace

        Run the query we've compiled, but instead of returning Dataset proxies,
        we only want the metadata key/value pairs we've selected.

        NOTE: The SQL left outer join returns a row for each row in the "left"
        table (Dataset) even if there is no matching foreign key in the "right"
        table (Metadata). This means a dataset with no metadata will result in
        a join row here with key and value of None. The `elif` in the loop will
        silently ignore rows with a null key to handle this case.

        Args:
            query: The basic filtered SQLAlchemy query object

        Returns:
            The aggregated keyspace JSON object
        """
        aggregate: JSONOBJECT = {
            "dataset": {c.name: None for c in Dataset.__table__._columns}
        }
        list = query.with_entities(Metadata.key, Metadata.value).all()
        for k, v in list:
            # "metalog" is a top-level key in the Metadata schema, but we
            # report it as a sub-key of "dataset".
            if k == Metadata.METALOG:
                self.accumulate(aggregate["dataset"], k, v)
            elif k:
                self.accumulate(aggregate, k, v)
        return aggregate

    def datasets(self, request: Request, json: JSONOBJECT, query: Query) -> JSONOBJECT:
        """Gather and paginate the selected datasets

        Run the query we've compiled, with pagination limits applied; collect
        results into a list of JSON objects including selected metadata keys.

        Args:
            request: The HTTP Request object
            json: The JSON query parameters
            query: The basic filtered SQLAlchemy query object

        Returns:
            The paginated dataset listing
        """
        try:
            datasets, paginated_result = self.get_paginated_obj(
                query=query, json=json, url=request.url
            )
        except (AttributeError, ProgrammingError, StatementError) as e:
            raise APIInternalError(
                f"Constructed SQL for {json} isn't executable"
            ) from e
        except Exception as e:
            raise APIInternalError(f"Unexpected SQL exception: {e}") from e

        keys = json.get("metadata")

        response = []
        for dataset in datasets:
            d = {
                "name": dataset.name,
                "resource_id": dataset.resource_id,
            }
            try:
                d["metadata"] = self._get_dataset_metadata(dataset, keys)
            except MetadataError:
                d["metadata"] = None
            response.append(d)

        paginated_result["results"] = response
        return paginated_result

    def _get(
        self, params: ApiParams, request: Request, context: ApiContext
    ) -> Response:
@@ -346,33 +453,7 @@ def _get(
        else:
            owner = json.get("owner")
            query = self._build_sql_query(owner, json.get("access"), query)

        try:
            datasets, paginated_result = self.get_paginated_obj(
                query=query, json=json, url=request.url
            )
        except (AttributeError, ProgrammingError, StatementError) as e:
            raise APIInternalError(
                f"Constructed SQL for {json} isn't executable"
            ) from e
        except Exception as e:
            raise APIInternalError(f"Unexpected SQL exception: {e}") from e

        keys = json.get("metadata")

        response = []
        for dataset in datasets:
            d = {
                "name": dataset.name,
                "resource_id": dataset.resource_id,
            }
            try:
                d["metadata"] = self._get_dataset_metadata(dataset, keys)
            except MetadataError as e:
                current_app.logger.warning(
                    "Error getting metadata {} for dataset {}: {}", keys, dataset, e
                )
            response.append(d)

        paginated_result["results"] = response
        return jsonify(paginated_result)
        if json.get("keysummary"):
            return jsonify(self.keyspace(query))
        else:
            return jsonify(self.datasets(request, json, query))
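
The merge behavior of `accumulate` is easiest to see in isolation. Here is a minimal standalone sketch, reusing the same method body as the diff above but driven by hand-written rows that mirror the `global.legacy` / `global.legacy.server` conflict exercised in the test below; the sample rows are illustrative, not actual query output:

```python
# Standalone sketch of the accumulate() merge semantics: when a scalar key
# ("global.legacy") collides with a nested namespace ("global.legacy.server"),
# the dict form wins, and later scalars never clobber an existing entry.
from typing import Any

def accumulate(aggregate: dict, key: str, value: Any) -> None:
    if isinstance(value, dict):
        p = aggregate.get(key)
        if p is None:
            p = {}
            aggregate[key] = p
        for k, v in value.items():
            accumulate(p, k, v)
    elif key not in aggregate:
        aggregate[key] = None

aggregate: dict = {}
rows = [
    ("global", {"legacy": "Truish"}),           # leaf value arrives first...
    ("global", {"legacy": {"server": "ABC"}}),  # ...then a conflicting namespace
    ("server", {"origin": "SAT"}),
]
for key, value in rows:
    accumulate(aggregate, key, value)

# {'global': {'legacy': {'server': None}}, 'server': {'origin': None}}
print(aggregate)
```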
91 changes: 91 additions & 0 deletions lib/pbench/test/unit/server/test_datasets_list.py
@@ -345,6 +345,43 @@ def test_get_bad_keys(self, query_as):
            )
        }

    def test_get_get_errors(self, server_config, query_as):
        """Test case reporting key errors

        Args:
            query_as: Query helper fixture
        """
        fio_1 = Dataset.query(name="fio_1")
        fio_2 = Dataset.query(name="fio_2")
        Metadata.setvalue(dataset=fio_1, key="global.test", value="ABC")
        Metadata.setvalue(dataset=fio_2, key="global.test.foo", value="ABC")
        response = query_as(
            {"metadata": "global.test.foo"},
            "drb",
            HTTPStatus.OK,
        )
        assert response.json == {
            "next_url": "",
            "results": [
                {
                    "metadata": {"global.test.foo": None},
                    "name": "drb",
                    "resource_id": "random_md5_string1",
                },
                {
                    "metadata": {"global.test.foo": None},
                    "name": "fio_1",
                    "resource_id": "random_md5_string3",
                },
                {
                    "metadata": {"global.test.foo": "ABC"},
                    "name": "fio_2",
                    "resource_id": "random_md5_string4",
                },
            ],
            "total": 3,
        }

    def test_get_unknown_keys(self, query_as):
        """Test case requesting non-existent query parameter keys.
@@ -496,3 +533,57 @@ def do_error(
                if key in m:
                    assert error in m
                    break

    def test_key_summary(self, query_as):
        """Test keyspace summary.

        With the `keysummary` query parameter, /datasets returns an aggregation
        of defined metadata key namespaces for the selected datasets.

        We add a few metadata keys to the ones provided by the fixture to show
        aggregation across multiple selected datasets. Note that without filter
        criteria, the query here should return drb's "drb" and "fio_1" datasets
        and test's public "fio_2" dataset.
        """
        drb = Dataset.query(name="drb")
        fio_1 = Dataset.query(name="fio_1")

        # Make sure we aggregate distinct namespaces across the three visible
        # datasets by setting some varied keys. We leave fio_2 "pristine" to
        # prove that the aggregator doesn't fail when we find no metadata for
        # a dataset. We deliberately create the conflicting "global.legacy"
        # and "global.legacy.server" to show that the conflict doesn't cause
        # a problem.
        Metadata.setvalue(dataset=drb, key="global.legacy", value="Truish")
        Metadata.setvalue(dataset=fio_1, key="server.origin", value="SAT")
        Metadata.setvalue(dataset=fio_1, key="global.legacy.server", value="ABC")
        response = query_as({"keysummary": "true"}, "drb", HTTPStatus.OK)
        assert response.json == {
            "dataset": {
                "access": None,
                "id": None,
                "metalog": {
                    "pbench": {
                        "config": None,
                        "date": None,
                        "name": None,
                        "script": None,
                    },
                    "run": {"controller": None},
                },
                "name": None,
                "owner_id": None,
                "resource_id": None,
                "uploaded": None,
            },
            "global": {"contact": None, "legacy": {"server": None}},
            "server": {
                "deletion": None,
                "index-map": {
                    "unit-test.v5.result-data-sample.2020-08": None,
                    "unit-test.v6.run-data.2020-08": None,
                    "unit-test.v6.run-toc.2020-05": None,
                },
                "origin": None,
            },
        }
