Skip to content

Commit

Permalink
[r] Support for AnVIL duos_id (#6620, PR #6668)
Browse files Browse the repository at this point in the history
  • Loading branch information
dsotirho-ucsc committed Jan 22, 2025
2 parents f495a1e + 5446fe0 commit 935f10b
Show file tree
Hide file tree
Showing 11 changed files with 172 additions and 65 deletions.
13 changes: 13 additions & 0 deletions src/azul/plugins/metadata/anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,10 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping:
'registered_identifier',
'title',
'data_modality',
# This field path is coupled to the field lookup in
# `self.manifest_config`. The coupling is brittle: a change to
# either one must be mirrored in the other.
'duos_id',
]
},
'donors': {
Expand Down Expand Up @@ -284,6 +288,11 @@ def manifest_config(self) -> ManifestConfig:
# the fields listed here and those used in `self._field_mapping`.
fields_to_omit_from_manifest = [
('contents', 'activities', 'activity_table'),
# We omit the `duos_id` field from manifests since there is only one
# DUOS bundle per dataset, and that bundle only contributes to outer
# entities of the `datasets` type, not to entities of the other
# types, such as files, which the manifest is generated from.
('contents', 'datasets', 'duos_id'),
('contents', 'files', 'uuid'),
('contents', 'files', 'version'),
]
Expand Down Expand Up @@ -351,6 +360,10 @@ def verbatim_pfb_schema(self,
is_polymorphic=is_duos_type)
]
if is_duos_type:
field_schemas.append(self._pfb_schema_from_anvil_column(table_name=table_name,
column_name='duos_id',
anvil_datatype='string',
is_polymorphic=True))
field_schemas.append(self._pfb_schema_from_anvil_column(table_name=table_name,
column_name='description',
anvil_datatype='string',
Expand Down
3 changes: 2 additions & 1 deletion src/azul/plugins/metadata/anvil/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,13 +500,14 @@ def _duos_types(cls) -> FieldTypes:
return {
'document_id': null_str,
'description': null_str,
'duos_id': null_str,
}

def _duos(self, dataset: EntityReference) -> MutableJSON:
return self._entity(dataset, self._duos_types())

def _is_duos(self, dataset: EntityReference) -> bool:
return 'description' in self.bundle.entities[dataset]
return 'duos_id' in self.bundle.entities[dataset]

def _dataset(self, dataset: EntityReference) -> MutableJSON:
if self._is_duos(dataset):
Expand Down
1 change: 1 addition & 0 deletions src/azul/plugins/metadata/anvil/service/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ def _non_pivotal_fields_by_entity_type(self) -> dict[str, set[str]]:
},
'datasets': {
'dataset_id',
'duos_id',
'title'
},
'diagnoses': {
Expand Down
29 changes: 17 additions & 12 deletions src/azul/plugins/repository/tdr_anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,16 +126,20 @@ class BundleType(Enum):
supplementary bundle to emit contributions for them, hence we treat them as
orphans.
DUOS bundles consist of a single dataset entity. This "entity" includes only
the dataset description retrieved from DUOS, while a copy of the BigQuery
row for this dataset is also included as an orphan. We chose this design
because there is only one dataset per snapshot, which is referenced in all
bundles. Therefore, only one request to DUOS per *snapshot* is necessary. If
the DUOS `description` were retrieved at the same time as the other fields
of the dataset entity, we would make one request per *bundle* instead,
potentially overloading the DUOS service. Our solution is to retrieve
`description` only in a bundle of this dedicated DUOS type, once per
snapshot, and merge it with the other dataset fields during aggregation.
DUOS bundles consist of a single dataset entity. This "entity" includes the
DUOS ID retrieved from TDR and the dataset description retrieved from DUOS,
while a copy of the BigQuery row for this dataset is also included as an
orphan. We chose this design because there is only one dataset per snapshot,
which is referenced in all bundles. Therefore, only one request to DUOS per
*snapshot* is necessary. If the DUOS `description` were retrieved at the
same time as the other fields of the dataset entity, we would make one
request per *bundle* instead, potentially overloading the DUOS service. Our
solution is to retrieve `description` only in a bundle of this dedicated
DUOS type, once per snapshot, and merge it with the other dataset fields
during aggregation. As a result, `duos_id` cannot be included in file
manifests since there is only one DUOS bundle per dataset, and that bundle
only contributes to outer entities of the `datasets` type, not to entities
of the other types, such as files, which the manifest is generated from.
All other bundles are replica bundles. Replica bundles consist of a batch of
rows from an arbitrary BigQuery table, which may or may not be described by
Expand Down Expand Up @@ -479,15 +483,16 @@ def _supplementary_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBund

def _duos_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle:
assert not bundle_fqid.is_batched, bundle_fqid
duos_info = self.tdr.get_duos(bundle_fqid.source)
duos_id, duos_info = self.tdr.get_duos(bundle_fqid.source)
description = None if duos_info is None else duos_info.get('studyDescription')
ref, row = self._get_dataset(bundle_fqid.source.spec)
expected_entity_id = change_version(bundle_fqid.uuid,
self.bundle_uuid_version,
self.datarepo_row_uuid_version)
assert ref.entity_id == expected_entity_id, (ref, bundle_fqid)
bundle = TDRAnvilBundle(fqid=bundle_fqid)
bundle.add_entity(ref, self._version, {'description': description})
entity_row = {'duos_id': duos_id, 'description': description}
bundle.add_entity(ref, self._version, entity_row)
# Classify as orphan to suppress the emission of a contribution
bundle.add_entity(ref, self._version, dict(row), is_orphan=True)
return bundle
Expand Down
14 changes: 10 additions & 4 deletions src/azul/terra.py
Original file line number Diff line number Diff line change
Expand Up @@ -646,19 +646,25 @@ def for_registered_user(cls, authentication: OAuth2) -> 'TDRClient':
def drs_client(self) -> DRSClient:
return DRSClient(http_client=self._http_client)

def get_duos(self, source: TDRSourceRef) -> Optional[MutableJSON]:
def get_duos(self,
source: TDRSourceRef
) -> tuple[str, MutableJSON] | tuple[None, None]:
response = self._retrieve_source(source)
try:
duos_id = response['duosFirecloudGroup']['duosId']
except (KeyError, TypeError):
log.warning('No DUOS ID available for %r', source.spec)
return None
return None, None
else:
url = self._duos_endpoint('dataset', 'registration', duos_id)
response = self._request('GET', url)
if response.status == 404:
log.warning('No DUOS dataset registration with ID %r from %r',
duos_id, source.spec)
return None
return None, None
else:
return self._check_response(url, response)
response = self._check_response(url, response)
consent_group = one(response['consentGroups'])
require(duos_id == consent_group['datasetIdentifier'],
'Mismatched identifiers', duos_id, consent_group)
return duos_id, response

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 6 additions & 2 deletions test/indexer/test_anvil.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def setUpClass(cls) -> None:

mock_duos_url = furl('https:://mock_duos.lan')

duos_id = 'foo'
duos_id = 'DUOS-000000'
duos_description = 'Study description from DUOS'

@classmethod
Expand All @@ -93,6 +93,9 @@ def _patch_duos(cls) -> None:
}
})),
Mock(spec=HTTPResponse, status=200, data=json.dumps({
'consentGroups': [{
'datasetIdentifier': cls.duos_id
}],
'studyDescription': cls.duos_description
}))
]))
Expand Down Expand Up @@ -251,8 +254,9 @@ def test_dataset_description(self):
# These fields are populated only in the primary bundle
self.assertEqual(dataset_ref.entity_id, contents['document_id'])
self.assertEqual(['phs000693'], contents['registered_identifier'])
# This field is populated only in the DUOS bundle
# These fields are populated only in the DUOS bundle
self.assertEqual('Study description from DUOS', contents['description'])
self.assertEqual('DUOS-000000', contents['duos_id'])
else:
self.fail(qualifier)
self.assertDictEqual(doc_counts, {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@
"datarepo_row_id": null,
"dataset_id": null,
"description": "Study description from DUOS",
"duos_id": "DUOS-000000",
"owner": null,
"principal_investigator": null,
"registered_identifier": null,
Expand Down Expand Up @@ -282,6 +283,7 @@
"datarepo_row_id": "2370f948-2783-4eb6-afea-e022897f4dcf",
"dataset_id": "52ee7665-7033-63f2-a8d9-ce8e32666739",
"description": null,
"duos_id": null,
"owner": [
"Debbie Nickerson"
],
Expand Down
8 changes: 8 additions & 0 deletions test/service/data/manifest/verbatim/pfb/anvil/pfb_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -560,6 +560,14 @@
"string"
]
},
{
"name": "duos_id",
"namespace": "anvil_dataset",
"type": [
"null",
"string"
]
},
{
"name": "owner",
"namespace": "anvil_dataset",
Expand Down
4 changes: 4 additions & 0 deletions test/service/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1760,6 +1760,10 @@ def bundles(cls) -> list[SourcedBundleFQID]:
def test_compact_manifest(self):
response = self._get_manifest(ManifestFormat.compact, filters={})
self.assertEqual(200, response.status_code)
# The `duos_id` field is absent from manifests since there is only one
# DUOS bundle per dataset, and that bundle only contributes to outer
# entities of the `datasets` type, not to entities of the other types,
# such as files, which the manifest is generated from.
expected = [
(
'bundles.bundle_uuid',
Expand Down
Loading

0 comments on commit 935f10b

Please sign in to comment.