[r] Support for AnVIL duos_id (#6620, PR #6668)

DataBiosphere · Jan 22, 2025 · 935f10b · 935f10b
2 parents f495a1e + 5446fe0
commit 935f10b
Show file tree

Hide file tree

Showing 11 changed files with 172 additions and 65 deletions.
diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py
@@ -156,6 +156,10 @@ def _field_mapping(self) -> MetadataPlugin._FieldMapping:
                         'registered_identifier',
                         'title',
                         'data_modality',
+                        # This field path has a brittle coupling that must be
+                        # maintained to the field lookup in
+                        # `self.manifest_config`.
+                        'duos_id',
                     ]
                 },
                 'donors': {
@@ -284,6 +288,11 @@ def manifest_config(self) -> ManifestConfig:
         # the fields listed here and those used in `self._field_mapping`.
         fields_to_omit_from_manifest = [
             ('contents', 'activities', 'activity_table'),
+            # We omit the `duos_id` field from manifests since there is only one
+            # DUOS bundle per dataset, and that bundle only contributes to outer
+            # entities of the `datasets` type, not to entities of the other
+            # types, such as files, which the manifest is generated from.
+            ('contents', 'datasets', 'duos_id'),
             ('contents', 'files', 'uuid'),
             ('contents', 'files', 'version'),
         ]
@@ -351,6 +360,10 @@ def verbatim_pfb_schema(self,
                                                    is_polymorphic=is_duos_type)
             ]
             if is_duos_type:
+                field_schemas.append(self._pfb_schema_from_anvil_column(table_name=table_name,
+                                                                        column_name='duos_id',
+                                                                        anvil_datatype='string',
+                                                                        is_polymorphic=True))
                 field_schemas.append(self._pfb_schema_from_anvil_column(table_name=table_name,
                                                                         column_name='description',
                                                                         anvil_datatype='string',

diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py
@@ -500,13 +500,14 @@ def _duos_types(cls) -> FieldTypes:
         return {
             'document_id': null_str,
             'description': null_str,
+            'duos_id': null_str,
         }
 
     def _duos(self, dataset: EntityReference) -> MutableJSON:
         return self._entity(dataset, self._duos_types())
 
     def _is_duos(self, dataset: EntityReference) -> bool:
-        return 'description' in self.bundle.entities[dataset]
+        return 'duos_id' in self.bundle.entities[dataset]
 
     def _dataset(self, dataset: EntityReference) -> MutableJSON:
         if self._is_duos(dataset):

diff --git a/src/azul/plugins/metadata/anvil/service/response.py b/src/azul/plugins/metadata/anvil/service/response.py
@@ -210,6 +210,7 @@ def _non_pivotal_fields_by_entity_type(self) -> dict[str, set[str]]:
             },
             'datasets': {
                 'dataset_id',
+                'duos_id',
                 'title'
             },
             'diagnoses': {

diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py
@@ -126,16 +126,20 @@ class BundleType(Enum):
     supplementary bundle to emit contributions for them, hence we treat them as
     orphans.
 
-    DUOS bundles consist of a single dataset entity. This "entity" includes only
-    the dataset description retrieved from DUOS, while a copy of the BigQuery
-    row for this dataset is also included as an orphan. We chose this design
-    because there is only one dataset per snapshot, which is referenced in all
-    bundles. Therefore, only one request to DUOS per *snapshot* is necessary. If
-    the DUOS `description` were retrieved at the same time as the other fields
-    of the dataset entity, we would make one request per *bundle* instead,
-    potentially overloading the DUOS service. Our solution is to retrieve
-    `description` only in a bundle of this dedicated DUOS type, once per
-    snapshot, and merge it with the other dataset fields during aggregation.
+    DUOS bundles consist of a single dataset entity. This "entity" includes the
+    DUOS ID retrieved from TDR and dataset description retrieved from DUOS,
+    while a copy of the BigQuery row for this dataset is also included as an
+    orphan. We chose this design because there is only one dataset per snapshot,
+    which is referenced in all bundles. Therefore, only one request to DUOS per
+    *snapshot* is necessary. If the DUOS `description` were retrieved at the
+    same time as the other fields of the dataset entity, we would make one
+    request per *bundle* instead, potentially overloading the DUOS service. Our
+    solution is to retrieve `description` only in a bundle of this dedicated
+    DUOS type, once per snapshot, and merge it with the other dataset fields
+    during aggregation. As a result, `duos_id` cannot be included in file
+    manifests since there is only one DUOS bundle per dataset, and that bundle
+    only contributes to outer entities of the `datasets` type, not to entities
+    of the other types, such as files, which the manifest is generated from.
 
     All other bundles are replica bundles. Replica bundles consist of a batch of
     rows from an arbitrary BigQuery table, which may or may not be described by
@@ -479,15 +483,16 @@ def _supplementary_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBund
 
     def _duos_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle:
         assert not bundle_fqid.is_batched, bundle_fqid
-        duos_info = self.tdr.get_duos(bundle_fqid.source)
+        duos_id, duos_info = self.tdr.get_duos(bundle_fqid.source)
         description = None if duos_info is None else duos_info.get('studyDescription')
         ref, row = self._get_dataset(bundle_fqid.source.spec)
         expected_entity_id = change_version(bundle_fqid.uuid,
                                             self.bundle_uuid_version,
                                             self.datarepo_row_uuid_version)
         assert ref.entity_id == expected_entity_id, (ref, bundle_fqid)
         bundle = TDRAnvilBundle(fqid=bundle_fqid)
-        bundle.add_entity(ref, self._version, {'description': description})
+        entity_row = {'duos_id': duos_id, 'description': description}
+        bundle.add_entity(ref, self._version, entity_row)
         # Classify as orphan to suppress the emission of a contribution
         bundle.add_entity(ref, self._version, dict(row), is_orphan=True)
         return bundle

diff --git a/src/azul/terra.py b/src/azul/terra.py
@@ -646,19 +646,25 @@ def for_registered_user(cls, authentication: OAuth2) -> 'TDRClient':
     def drs_client(self) -> DRSClient:
         return DRSClient(http_client=self._http_client)
 
-    def get_duos(self, source: TDRSourceRef) -> Optional[MutableJSON]:
+    def get_duos(self,
+                 source: TDRSourceRef
+                 ) -> tuple[str, MutableJSON] | tuple[None, None]:
         response = self._retrieve_source(source)
         try:
             duos_id = response['duosFirecloudGroup']['duosId']
         except (KeyError, TypeError):
             log.warning('No DUOS ID available for %r', source.spec)
-            return None
+            return None, None
         else:
             url = self._duos_endpoint('dataset', 'registration', duos_id)
             response = self._request('GET', url)
             if response.status == 404:
                 log.warning('No DUOS dataset registration with ID %r from %r',
                             duos_id, source.spec)
-                return None
+                return None, None
             else:
-                return self._check_response(url, response)
+                response = self._check_response(url, response)
+                consent_group = one(response['consentGroups'])
+                require(duos_id == consent_group['datasetIdentifier'],
+                        'Mismatched identifiers', duos_id, consent_group)
+                return duos_id, response
diff --git a/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json b/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json
diff --git a/test/indexer/test_anvil.py b/test/indexer/test_anvil.py
@@ -75,7 +75,7 @@ def setUpClass(cls) -> None:
 
     mock_duos_url = furl('https:://mock_duos.lan')
 
-    duos_id = 'foo'
+    duos_id = 'DUOS-000000'
     duos_description = 'Study description from DUOS'
 
     @classmethod
@@ -93,6 +93,9 @@ def _patch_duos(cls) -> None:
                                                }
                                            })),
                                            Mock(spec=HTTPResponse, status=200, data=json.dumps({
+                                               'consentGroups': [{
+                                                   'datasetIdentifier': cls.duos_id
+                                               }],
                                                'studyDescription': cls.duos_description
                                            }))
                                        ]))
@@ -251,8 +254,9 @@ def test_dataset_description(self):
                     # These fields are populated only in the primary bundle
                     self.assertEqual(dataset_ref.entity_id, contents['document_id'])
                     self.assertEqual(['phs000693'], contents['registered_identifier'])
-                    # This field is populated only in the DUOS bundle
+                    # These fields are populated only in the DUOS bundle
                     self.assertEqual('Study description from DUOS', contents['description'])
+                    self.assertEqual('DUOS-000000', contents['duos_id'])
             else:
                 self.fail(qualifier)
         self.assertDictEqual(doc_counts, {

diff --git a/test/service/data/manifest/verbatim/pfb/anvil/pfb_entities.json b/test/service/data/manifest/verbatim/pfb/anvil/pfb_entities.json
@@ -110,6 +110,7 @@
             "datarepo_row_id": null,
             "dataset_id": null,
             "description": "Study description from DUOS",
+            "duos_id": "DUOS-000000",
             "owner": null,
             "principal_investigator": null,
             "registered_identifier": null,
@@ -282,6 +283,7 @@
             "datarepo_row_id": "2370f948-2783-4eb6-afea-e022897f4dcf",
             "dataset_id": "52ee7665-7033-63f2-a8d9-ce8e32666739",
             "description": null,
+            "duos_id": null,
             "owner": [
                 "Debbie Nickerson"
             ],

diff --git a/test/service/data/manifest/verbatim/pfb/anvil/pfb_schema.json b/test/service/data/manifest/verbatim/pfb/anvil/pfb_schema.json
@@ -560,6 +560,14 @@
                                 "string"
                             ]
                         },
+                        {
+                            "name": "duos_id",
+                            "namespace": "anvil_dataset",
+                            "type": [
+                                "null",
+                                "string"
+                            ]
+                        },
                         {
                             "name": "owner",
                             "namespace": "anvil_dataset",

diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py
@@ -1760,6 +1760,10 @@ def bundles(cls) -> list[SourcedBundleFQID]:
     def test_compact_manifest(self):
         response = self._get_manifest(ManifestFormat.compact, filters={})
         self.assertEqual(200, response.status_code)
+        # The `duos_id` field is absent from manifests since there is only one
+        # DUOS bundle per dataset, and that bundle only contributes to outer
+        # entities of the `datasets` type, not to entities of the other types,
+        # such as files, which the manifest is generated from.
         expected = [
             (
                 'bundles.bundle_uuid',