From 7bb37937e2cd806484d472165090acb64228a47c Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Fri, 3 May 2024 18:49:53 -0700 Subject: [PATCH 1/2] Cover nested project facets in unit test --- test/service/test_response.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/service/test_response.py b/test/service/test_response.py index 1c6de3a408..1fad76d80b 100644 --- a/test/service/test_response.py +++ b/test/service/test_response.py @@ -3498,6 +3498,12 @@ def test_projects_response(self): self.assertEqual(expected_bionetwork_name, project['bionetworkName']) self.assertTrue(project['isTissueAtlasProject']) + tissue_atlas = { + entry['term']: entry['count'] + for entry in response_json['termFacets']['tissueAtlas']['terms'] + } + self.assertEqual({'Lung': 1, 'Retina': 1, 'Blood': 1}, tissue_atlas) + class TestUnpopulatedIndexResponse(IndexResponseTestCase): From 78573e46cddcc77213a66691b39f9b675cac33fa Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Mon, 22 Apr 2024 03:16:00 -0700 Subject: [PATCH 2/2] [a] Aggregate `Nested` fields by one property (#5519) --- lambdas/service/app.py | 2 +- lambdas/service/openapi.json | 2 +- src/azul/indexer/document.py | 3 +++ src/azul/indexer/document_service.py | 8 +++++++- src/azul/service/elasticsearch_service.py | 3 +++ test/service/test_request_builder.py | 14 +++++++++++++- 6 files changed, 28 insertions(+), 4 deletions(-) diff --git a/lambdas/service/app.py b/lambdas/service/app.py index 31ba174b86..ee1077d0e1 100644 --- a/lambdas/service/app.py +++ b/lambdas/service/app.py @@ -231,7 +231,7 @@ # changes and reset the minor version to zero. Otherwise, increment only # the minor version for backwards compatible changes. A backwards # compatible change is one that does not require updates to clients. - 'version': '7.3' + 'version': '7.4' }, 'tags': [ { diff --git a/lambdas/service/openapi.json b/lambdas/service/openapi.json index 972e073079..afbeedf426 100644 --- a/lambdas/service/openapi.json +++ b/lambdas/service/openapi.json @@ -3,7 +3,7 @@ "info": { "title": "azul_service", "description": "\n# Overview\n\nAzul is a REST web service for querying metadata associated with\nboth experimental and analysis data from a data repository. In order\nto deliver response times that make it suitable for interactive use\ncases, the set of metadata properties that it exposes for sorting,\nfiltering, and aggregation is limited. Azul provides a uniform view\nof the metadata over a range of diverse schemas, effectively\nshielding clients from changes in the schemas as they occur over\ntime. It does so, however, at the expense of detail in the set of\nmetadata properties it exposes and in the accuracy with which it\naggregates them.\n\nAzul denormalizes and aggregates metadata into several different\nindices for selected entity types. Metadata entities can be queried\nusing the [Index](#operations-tag-Index) endpoints.\n\nA set of indices forms a catalog. There is a default catalog called\n`dcp2` which will be used unless a\ndifferent catalog name is specified using the `catalog` query\nparameter. Metadata from different catalogs is completely\nindependent: a response obtained by querying one catalog does not\nnecessarily correlate to a response obtained by querying another\none. Two catalogs can contain metadata from the same sources or\ndifferent sources. It is only guaranteed that the body of a\nresponse by any given endpoint adheres to one schema,\nindependently of which catalog was specified in the request.\n\nAzul provides the ability to download data and metadata via the\n[Manifests](#operations-tag-Manifests) endpoints. The\n`curl` format manifests can be used to\ndownload data files. Other formats provide various views of the\nmetadata. Manifests can be generated for a selection of files using\nfilters. These filters are interchangeable with the filters used by\nthe [Index](#operations-tag-Index) endpoints.\n\nAzul also provides a [summary](#operations-Index-get_index_summary)\nview of indexed data.\n\n## Data model\n\nAny index, when queried, returns a JSON array of hits. Each hit\nrepresents a metadata entity. Nested in each hit is a summary of the\nproperties of entities associated with the hit. An entity is\nassociated either by a direct edge in the original metadata graph,\nor indirectly as a series of edges. The nested properties are\ngrouped by the type of the associated entity. The properties of all\ndata files associated with a particular sample, for example, are\nlisted under `hits[*].files` in a `/index/samples` response. It is\nimportant to note that while each _hit_ represents a discrete\nentity, the properties nested within that hit are the result of an\naggregation over potentially many associated entities.\n\nTo illustrate this, consider a data file that is part of two\nprojects (a project is a group of related experiments, typically by\none laboratory, institution or consortium). Querying the `files`\nindex for this file yields a hit looking something like:\n\n```\n{\n \"projects\": [\n {\n \"projectTitle\": \"Project One\"\n \"laboratory\": ...,\n ...\n },\n {\n \"projectTitle\": \"Project Two\"\n \"laboratory\": ...,\n ...\n }\n ],\n \"files\": [\n {\n \"format\": \"pdf\",\n \"name\": \"Team description.pdf\",\n ...\n }\n ]\n}\n```\n\nThis example hit contains two kinds of nested entities (a hit in an\nactual Azul response will contain more): There are the two projects\nentities, and the file itself. These nested entities contain\nselected metadata properties extracted in a consistent way. This\nmakes filtering and sorting simple.\n\nAlso notice that there is only one file. When querying a particular\nindex, the corresponding entity will always be a singleton like\nthis.\n", - "version": "7.3" + "version": "7.4" }, "tags": [ { diff --git a/src/azul/indexer/document.py b/src/azul/indexer/document.py index bf06473db7..db57cad72f 100644 --- a/src/azul/indexer/document.py +++ b/src/azul/indexer/document.py @@ -29,6 +29,7 @@ import attr from more_itertools import ( + first, one, ) @@ -990,9 +991,11 @@ def from_index(self, value: str) -> Optional[str]: class Nested(PassThrough[JSON]): properties: Mapping[str, FieldType] + agg_property: str def __init__(self, **properties): super().__init__(JSON, es_type='nested') + self.agg_property = first(properties.keys()) self.properties = properties def api_filter_schema(self, relation: str) -> JSON: diff --git a/src/azul/indexer/document_service.py b/src/azul/indexer/document_service.py index 80cb51bc20..3f0e7e0ef5 100644 --- a/src/azul/indexer/document_service.py +++ b/src/azul/indexer/document_service.py @@ -24,6 +24,7 @@ Document, FieldType, FieldTypes, + Nested, ) from azul.indexer.transform import ( Transformer, @@ -75,7 +76,12 @@ def field_type(self, catalog: CatalogName, path: FieldPath) -> FieldType: try: field_types = field_types[element] except (KeyError, TypeError) as e: - raise type(e)('Path not represented in field_types', path) + if isinstance(field_types, list): + field_types = one(field_types) + if isinstance(field_types, Nested) and element == field_types.agg_property: + field_types = field_types.properties[element] + else: + raise type(e)('Path not represented in field_types', path) if isinstance(field_types, list): field_types = one(field_types) return field_types diff --git a/src/azul/service/elasticsearch_service.py b/src/azul/service/elasticsearch_service.py index 7b50f90940..b36665be89 100644 --- a/src/azul/service/elasticsearch_service.py +++ b/src/azul/service/elasticsearch_service.py @@ -319,6 +319,9 @@ def _prepare_aggregation(self, *, facet: str, facet_path: FieldPath) -> Agg: query = self.filter_stage.prepare_query(skip_field_paths=(facet_path,)) agg = A('filter', query) + field_type = self.service.field_type(self.catalog, facet_path) + if isinstance(field_type, Nested): + facet_path = dotted(facet_path, field_type.agg_property) # Make an inner agg that will contain the terms in question path = dotted(facet_path, 'keyword') # FIXME: Approximation errors for terms aggregation are unchecked diff --git a/test/service/test_request_builder.py b/test/service/test_request_builder.py index 195634d54c..218f964313 100644 --- a/test/service/test_request_builder.py +++ b/test/service/test_request_builder.py @@ -9,6 +9,10 @@ from azul import ( CatalogName, ) +from azul.indexer.document import ( + FieldTypes, + null_str, +) from azul.logging import ( configure_test_logging, ) @@ -373,6 +377,14 @@ def test_create_aggregate(self): } } + class Service(self.Service): + + def field_types(self, catalog: CatalogName) -> FieldTypes: + return { + **super().field_types(catalog), + 'path': {'to': {'foo': null_str}} + } + class MockPlugin(self.MockPlugin): @property @@ -386,7 +398,7 @@ def field_mapping(self) -> Mapping[str, FieldPath]: def facets(self) -> Sequence[str]: return ['foo'] - service = self.Service(MockPlugin()) + service = Service(MockPlugin()) filters = Filters(explicit={}, source_ids=set()) post_filter = True