diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py index c4e0edcc3d..292a40b058 100644 --- a/src/azul/plugins/metadata/anvil/__init__.py +++ b/src/azul/plugins/metadata/anvil/__init__.py @@ -156,6 +156,10 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping: 'registered_identifier', 'title', 'data_modality', + # This field path has a brittle coupling that must be + # maintained to the field lookup in + # `self.manifest_config`. + 'duos_id', ] }, 'donors': { @@ -284,6 +288,11 @@ def manifest_config(self) -> ManifestConfig: # the fields listed here and those used in `self._field_mapping`. fields_to_omit_from_manifest = [ ('contents', 'activities', 'activity_table'), + # We omit the `duos_id` field from manifests since there is only one + # DUOS bundle per dataset, and that bundle only contributes to outer + # entities of the `datasets` type, not to entities of the other + # types, such as files, which the manifest is generated from. + ('contents', 'datasets', 'duos_id'), ('contents', 'files', 'uuid'), ('contents', 'files', 'version'), ] @@ -351,6 +360,10 @@ def verbatim_pfb_schema(self, is_polymorphic=is_duos_type) ] if is_duos_type: + field_schemas.append(self._pfb_schema_from_anvil_column(table_name=table_name, + column_name='duos_id', + anvil_datatype='string', + is_polymorphic=True)) field_schemas.append(self._pfb_schema_from_anvil_column(table_name=table_name, column_name='description', anvil_datatype='string', diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py index 461e504f32..32fc7c96c4 100644 --- a/src/azul/plugins/metadata/anvil/indexer/transform.py +++ b/src/azul/plugins/metadata/anvil/indexer/transform.py @@ -500,13 +500,14 @@ def _duos_types(cls) -> FieldTypes: return { 'document_id': null_str, 'description': null_str, + 'duos_id': null_str, } def _duos(self, dataset: EntityReference) -> MutableJSON: return self._entity(dataset, self._duos_types()) def _is_duos(self, dataset: EntityReference) -> bool: - return 'description' in self.bundle.entities[dataset] + return 'duos_id' in self.bundle.entities[dataset] def _dataset(self, dataset: EntityReference) -> MutableJSON: if self._is_duos(dataset): diff --git a/src/azul/plugins/metadata/anvil/service/response.py b/src/azul/plugins/metadata/anvil/service/response.py index 8d0be3c129..6175bd6473 100644 --- a/src/azul/plugins/metadata/anvil/service/response.py +++ b/src/azul/plugins/metadata/anvil/service/response.py @@ -210,6 +210,7 @@ def _non_pivotal_fields_by_entity_type(self) -> dict[str, set[str]]: }, 'datasets': { 'dataset_id', + 'duos_id', 'title' }, 'diagnoses': { diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index b376636400..6b489a98d5 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -126,16 +126,20 @@ class BundleType(Enum): supplementary bundle to emit contributions for them, hence we treat them as orphans. - DUOS bundles consist of a single dataset entity. This "entity" includes only - the dataset description retrieved from DUOS, while a copy of the BigQuery - row for this dataset is also included as an orphan. We chose this design - because there is only one dataset per snapshot, which is referenced in all - bundles. Therefore, only one request to DUOS per *snapshot* is necessary. If - the DUOS `description` were retrieved at the same time as the other fields - of the dataset entity, we would make one request per *bundle* instead, - potentially overloading the DUOS service. Our solution is to retrieve - `description` only in a bundle of this dedicated DUOS type, once per - snapshot, and merge it with the other dataset fields during aggregation. + DUOS bundles consist of a single dataset entity. This "entity" includes the + DUOS ID retrieved from TDR and dataset description retrieved from DUOS, + while a copy of the BigQuery row for this dataset is also included as an + orphan. We chose this design because there is only one dataset per snapshot, + which is referenced in all bundles. Therefore, only one request to DUOS per + *snapshot* is necessary. If the DUOS `description` were retrieved at the + same time as the other fields of the dataset entity, we would make one + request per *bundle* instead, potentially overloading the DUOS service. Our + solution is to retrieve `description` only in a bundle of this dedicated + DUOS type, once per snapshot, and merge it with the other dataset fields + during aggregation. As a result, `duos_id` cannot be included in file + manifests since there is only one DUOS bundle per dataset, and that bundle + only contributes to outer entities of the `datasets` type, not to entities + of the other types, such as files, which the manifest is generated from. All other bundles are replica bundles. Replica bundles consist of a batch of rows from an arbitrary BigQuery table, which may or may not be described by @@ -479,7 +483,7 @@ def _supplementary_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBund def _duos_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: assert not bundle_fqid.is_batched, bundle_fqid - duos_info = self.tdr.get_duos(bundle_fqid.source) + duos_id, duos_info = self.tdr.get_duos(bundle_fqid.source) description = None if duos_info is None else duos_info.get('studyDescription') ref, row = self._get_dataset(bundle_fqid.source.spec) expected_entity_id = change_version(bundle_fqid.uuid, @@ -487,7 +491,8 @@ def _duos_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: self.datarepo_row_uuid_version) assert ref.entity_id == expected_entity_id, (ref, bundle_fqid) bundle = TDRAnvilBundle(fqid=bundle_fqid) - bundle.add_entity(ref, self._version, {'description': description}) + entity_row = {'duos_id': duos_id, 'description': description} + bundle.add_entity(ref, self._version, entity_row) # Classify as orphan to suppress the emission of a contribution bundle.add_entity(ref, self._version, dict(row), is_orphan=True) return bundle diff --git a/src/azul/terra.py b/src/azul/terra.py index e5046850a7..fdc61e745f 100644 --- a/src/azul/terra.py +++ b/src/azul/terra.py @@ -646,19 +646,25 @@ def for_registered_user(cls, authentication: OAuth2) -> 'TDRClient': def drs_client(self) -> DRSClient: return DRSClient(http_client=self._http_client) - def get_duos(self, source: TDRSourceRef) -> Optional[MutableJSON]: + def get_duos(self, + source: TDRSourceRef + ) -> tuple[str, MutableJSON] | tuple[None, None]: response = self._retrieve_source(source) try: duos_id = response['duosFirecloudGroup']['duosId'] except (KeyError, TypeError): log.warning('No DUOS ID available for %r', source.spec) - return None + return None, None else: url = self._duos_endpoint('dataset', 'registration', duos_id) response = self._request('GET', url) if response.status == 404: log.warning('No DUOS dataset registration with ID %r from %r', duos_id, source.spec) - return None + return None, None else: - return self._check_response(url, response) + response = self._check_response(url, response) + consent_group = one(response['consentGroups']) + require(duos_id == consent_group['datasetIdentifier'], + 'Mismatched identifiers', duos_id, consent_group) + return duos_id, response diff --git a/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json b/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json index 9859b200f1..b029436606 100644 --- a/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json +++ b/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json @@ -2,6 +2,7 @@ "entities": { "anvil_dataset/2370f948-2783-4eb6-afea-e022897f4dcf": { "description": "Study description from DUOS", + "duos_id": "DUOS-000000", "version": "2022-06-01T00:00:00.000000Z" } }, diff --git a/test/indexer/test_anvil.py b/test/indexer/test_anvil.py index 1cbc52ba0e..e6f0114dd0 100644 --- a/test/indexer/test_anvil.py +++ b/test/indexer/test_anvil.py @@ -75,7 +75,7 @@ def setUpClass(cls) -> None: mock_duos_url = furl('https:://mock_duos.lan') - duos_id = 'foo' + duos_id = 'DUOS-000000' duos_description = 'Study description from DUOS' @classmethod @@ -93,6 +93,9 @@ def _patch_duos(cls) -> None: } })), Mock(spec=HTTPResponse, status=200, data=json.dumps({ + 'consentGroups': [{ + 'datasetIdentifier': cls.duos_id + }], 'studyDescription': cls.duos_description })) ])) @@ -251,8 +254,9 @@ def test_dataset_description(self): # These fields are populated only in the primary bundle self.assertEqual(dataset_ref.entity_id, contents['document_id']) self.assertEqual(['phs000693'], contents['registered_identifier']) - # This field is populated only in the DUOS bundle + # These fields are populated only in the DUOS bundle self.assertEqual('Study description from DUOS', contents['description']) + self.assertEqual('DUOS-000000', contents['duos_id']) else: self.fail(qualifier) self.assertDictEqual(doc_counts, { diff --git a/test/service/data/manifest/verbatim/pfb/anvil/pfb_entities.json b/test/service/data/manifest/verbatim/pfb/anvil/pfb_entities.json index 8cb9a00eda..145153fcb3 100644 --- a/test/service/data/manifest/verbatim/pfb/anvil/pfb_entities.json +++ b/test/service/data/manifest/verbatim/pfb/anvil/pfb_entities.json @@ -110,6 +110,7 @@ "datarepo_row_id": null, "dataset_id": null, "description": "Study description from DUOS", + "duos_id": "DUOS-000000", "owner": null, "principal_investigator": null, "registered_identifier": null, @@ -282,6 +283,7 @@ "datarepo_row_id": "2370f948-2783-4eb6-afea-e022897f4dcf", "dataset_id": "52ee7665-7033-63f2-a8d9-ce8e32666739", "description": null, + "duos_id": null, "owner": [ "Debbie Nickerson" ], diff --git a/test/service/data/manifest/verbatim/pfb/anvil/pfb_schema.json b/test/service/data/manifest/verbatim/pfb/anvil/pfb_schema.json index 9bdd6fcf66..1f0d38f6f7 100644 --- a/test/service/data/manifest/verbatim/pfb/anvil/pfb_schema.json +++ b/test/service/data/manifest/verbatim/pfb/anvil/pfb_schema.json @@ -560,6 +560,14 @@ "string" ] }, + { + "name": "duos_id", + "namespace": "anvil_dataset", + "type": [ + "null", + "string" + ] + }, { "name": "owner", "namespace": "anvil_dataset", diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index 8fe99a5893..7bdae5e620 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -1760,6 +1760,10 @@ def bundles(cls) -> list[SourcedBundleFQID]: def test_compact_manifest(self): response = self._get_manifest(ManifestFormat.compact, filters={}) self.assertEqual(200, response.status_code) + # The `duos_id` field is absent from manifests since there is only one + # DUOS bundle per dataset, and that bundle only contributes to outer + # entities of the `datasets` type, not to entities of the other types, + # such as files, which the manifest is generated from. expected = [ ( 'bundles.bundle_uuid', diff --git a/test/service/test_response_anvil.py b/test/service/test_response_anvil.py index 2d6fb44ab6..003bea7bc2 100644 --- a/test/service/test_response_anvil.py +++ b/test/service/test_response_anvil.py @@ -36,7 +36,7 @@ def tearDownClass(cls): @classmethod def bundles(cls) -> list[TDRAnvilBundleFQID]: - return [cls.primary_bundle()] + return [cls.primary_bundle(), cls.duos_bundle()] def test_entity_indices(self): self.maxDiff = None @@ -881,6 +881,30 @@ def test_entity_indices(self): }, 'bundles': { 'hits': [ + { + 'activities': [], + 'biosamples': [], + 'bundles': [ + { + 'accessible': True, + 'bundle_uuid': '2370f948-2783-aeb6-afea-e022897f4dcf', + 'bundle_version': '2022-06-01T00:00:00.000000Z' + } + ], + 'datasets': [ + {'duos_id': ['DUOS-000000']} + ], + 'diagnoses': [], + 'donors': [], + 'entryId': '2370f948-2783-aeb6-afea-e022897f4dcf', + 'files': [], + 'sources': [ + { + 'source_id': '6c87f0e1-509d-46a4-b845-7584df39263b', + 'source_spec': 'tdr:bigquery:gcp:test_anvil_project:anvil_snapshot:/2' + } + ] + }, { 'activities': [ { @@ -959,125 +983,157 @@ def test_entity_indices(self): } ], 'pagination': { - 'count': 1, + 'count': 2, 'next': None, 'order': 'asc', 'pages': 1, 'previous': None, 'size': 10, 'sort': 'bundle_uuid', - 'total': 1 + 'total': 2 }, 'termFacets': { 'accessible': { - 'terms': [{'count': 1, 'term': 'true'}], - 'total': 1, + 'terms': [{'count': 2, 'term': 'true'}], + 'total': 2, 'type': 'terms' }, 'activities.activity_type': { - 'terms': [{'count': 1, 'term': 'Sequencing'}], - 'total': 1, + 'terms': [ + {'count': 1, 'term': 'Sequencing'}, + {'count': 1, 'term': None} + ], + 'total': 2, 'type': 'terms' }, 'activities.assay_type': { - 'terms': [{'count': 1, 'term': None}], - 'total': 1, + 'terms': [{'count': 2, 'term': None}], + 'total': 2, 'type': 'terms' }, 'activities.data_modality': { - 'terms': [{'count': 1, 'term': None}], - 'total': 1, + 'terms': [{'count': 2, 'term': None}], + 'total': 2, 'type': 'terms' }, 'biosamples.anatomical_site': { - 'terms': [{'count': 1, 'term': None}], - 'total': 1, + 'terms': [{'count': 2, 'term': None}], + 'total': 2, 'type': 'terms' }, 'biosamples.biosample_type': { - 'terms': [{'count': 1, 'term': None}], - 'total': 1, + 'terms': [{'count': 2, 'term': None}], + 'total': 2, 'type': 'terms' }, 'biosamples.disease': { - 'terms': [{'count': 1, 'term': None}], - 'total': 1, + 'terms': [{'count': 2, 'term': None}], + 'total': 2, 'type': 'terms' }, 'datasets.consent_group': { - 'terms': [{'count': 1, 'term': 'DS-BDIS'}], - 'total': 1, + 'terms': [ + {'count': 1, 'term': 'DS-BDIS'}, + {'count': 1, 'term': None} + ], + 'total': 2, 'type': 'terms' }, 'datasets.data_use_permission': { - 'terms': [{'count': 1, 'term': 'DS-BDIS'}], - 'total': 1, + 'terms': [ + {'count': 1, 'term': 'DS-BDIS'}, + {'count': 1, 'term': None} + ], + 'total': 2, 'type': 'terms' }, 'datasets.registered_identifier': { - 'terms': [{'count': 1, 'term': 'phs000693'}], - 'total': 1, + 'terms': [ + {'count': 1, 'term': 'phs000693'}, + {'count': 1, 'term': None} + ], + 'total': 2, 'type': 'terms' }, 'datasets.title': { - 'terms': [{'count': 1, 'term': 'ANVIL_CMG_UWASH_DS_BDIS'}], - 'total': 1, + 'terms': [ + {'count': 1, 'term': 'ANVIL_CMG_UWASH_DS_BDIS'}, + {'count': 1, 'term': None} + ], + 'total': 2, 'type': 'terms' }, 'diagnoses.disease': { 'terms': [ {'count': 1, 'term': 'redacted-A61iJlLx'}, - {'count': 1, 'term': 'redacted-g50ublm/'} + {'count': 1, 'term': 'redacted-g50ublm/'}, + {'count': 1, 'term': None} ], - 'total': 1, + 'total': 2, 'type': 'terms' }, 'diagnoses.phenopacket': { - 'terms': [{'count': 1, 'term': None}], - 'total': 1, + 'terms': [{'count': 2, 'term': None}], + 'total': 2, 'type': 'terms' }, 'diagnoses.phenotype': { - 'terms': [{'count': 1, 'term': 'redacted-acSYHZUr'}], - 'total': 1, + 'terms': [ + {'count': 1, 'term': 'redacted-acSYHZUr'}, + {'count': 1, 'term': None} + ], + 'total': 2, 'type': 'terms' }, 'donors.organism_type': { - 'terms': [{'count': 1, 'term': 'redacted-ACw+6ecI'}], - 'total': 1, + 'terms': [ + {'count': 1, 'term': 'redacted-ACw+6ecI'}, + {'count': 1, 'term': None} + ], + 'total': 2, 'type': 'terms' }, 'donors.phenotypic_sex': { - 'terms': [{'count': 1, 'term': 'redacted-JfQ0b3xG'}], - 'total': 1, + 'terms': [ + {'count': 1, 'term': 'redacted-JfQ0b3xG'}, + {'count': 1, 'term': None} + ], + 'total': 2, 'type': 'terms' }, 'donors.reported_ethnicity': { - 'terms': [{'count': 1, 'term': 'redacted-NSkwDycK'}], - 'total': 1, + 'terms': [ + {'count': 1, 'term': 'redacted-NSkwDycK'}, + {'count': 1, 'term': None} + ], + 'total': 2, 'type': 'terms' }, 'files.data_modality': { - 'terms': [{'count': 1, 'term': None}], - 'total': 1, + 'terms': [{'count': 2, 'term': None}], + 'total': 2, 'type': 'terms' }, 'files.file_format': { 'terms': [ {'count': 1, 'term': '.bam'}, - {'count': 1, 'term': '.vcf.gz'} + {'count': 1, 'term': '.vcf.gz'}, + {'count': 1, 'term': None} ], - 'total': 1, + 'total': 2, 'type': 'terms' }, 'files.is_supplementary': { - 'terms': [{'count': 1, 'term': 'false'}], - 'total': 1, + 'terms': [ + {'count': 1, 'term': 'false'}, + {'count': 1, 'term': None} + ], + 'total': 2, 'type': 'terms' }, 'files.reference_assembly': { - 'terms': [{'count': 1, 'term': None}], - 'total': 1, + 'terms': [{'count': 2, 'term': None}], + 'total': 2, 'type': 'terms' } } @@ -1093,6 +1149,10 @@ def test_entity_indices(self): } ], 'bundles': [ + { + 'bundle_uuid': '2370f948-2783-aeb6-afea-e022897f4dcf', + 'bundle_version': '2022-06-01T00:00:00.000000Z' + }, { 'bundle_uuid': '826dea02-e274-affe-aabc-eb3db63ad068', 'bundle_version': '2022-06-01T00:00:00.000000Z' @@ -1159,7 +1219,9 @@ def test_entity_indices(self): 'data_modality': [ None ], - 'accessible': True + 'accessible': True, + 'description': 'Study description from DUOS', + 'duos_id': 'DUOS-000000' } ], 'diagnoses': [