[u r] Remove support for indexing TDR datasets (#6426)
nadove-ucsc committed Aug 1, 2024
1 parent 94525aa commit b2705b5
Showing 20 changed files with 98 additions and 156 deletions.
2 changes: 1 addition & 1 deletion OPERATOR.rst
@@ -278,7 +278,7 @@ To specify a catalog to be reindexed, set ``Key`` to ``azul_current_catalog``
 and ``Value`` to the name of the catalog, for example, ``dcp3``. To specify the
 sources to be reindexed, set ``Key`` to ``azul_current_sources`` and
 ``Value`` to a space-separated list of sources globs, e.g.
-``*:snapshot/hca_dev_* *:snapshot/lungmap_dev_*``. Check the inputs you just
+``*:hca_dev_* *:lungmap_dev_*``. Check the inputs you just
 made. Start the ``reindex`` job by clicking on ``Run job``. Wait until the job
 has completed.

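With the ``snapshot/`` segment gone, reindex source globs match directly against the snapshot name. A small illustrative check, assuming glob semantics comparable to ``fnmatch`` as a stand-in for however Azul actually filters sources; the spec string below is hypothetical:

from fnmatch import fnmatch

# Hypothetical TDR source spec for a dev snapshot; real specs follow the
# same 'tdr:<project>:<snapshot>:<prefix>/<length>' shape.
spec = 'tdr:datarepo-dev-a9252919:hca_dev_20201023_panc:2/1'

# The new-style glob from the instructions above matches the spec directly ...
assert fnmatch(spec, '*:hca_dev_*')
# ... while the old-style glob containing 'snapshot/' no longer does.
assert not fnmatch(spec, '*:snapshot/hca_dev_*')
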
10 changes: 10 additions & 0 deletions UPGRADING.rst
@@ -20,6 +20,16 @@ reverted. This is all fairly informal and loosely defined. Hopefully we won't
 have too many entries in this file.
 
 
+#6426 Clean-up and generalize TDR source specs
+==============================================
+
+The "snapshot/" string has been removed from TDR source specs.
+
+Update the ``mksrc`` function in ``environment.py`` for each of your personal
+deployments. As always, use the sandbox deployment's ``environment.py`` as a
+model when upgrading personal deployments.
+
+
 #6381 Update Terraform to 1.9.x
 ===============================
 

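For personal deployments, the change boils down to dropping the ``'snapshot/' +`` prefix inside ``mksrc``. A minimal sketch, assuming a ``mksrc`` shaped like the sandbox variant further down in this diff; the ``pop`` flag and the way ``project`` is derived are placeholders, not the real definitions:

pop = 1  # placeholder for the flag bit that excludes a source from the catalog

def mksrc(google_project: str,
          snapshot: str,
          subgraphs: int,
          prefix: str = '',
          flags: int = 0) -> tuple[str, str | None]:
    project = snapshot  # placeholder; real deployments derive the project name
    source = None if flags & pop else ':'.join([
        'tdr',
        google_project,
        snapshot,  # previously: 'snapshot/' + snapshot
        prefix + '/0'
    ])
    return project, source
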
2 changes: 1 addition & 1 deletion deployments/anvilbox/environment.py
@@ -41,7 +41,7 @@ def mksrc(google_project,
     source = None if flags & pop else ':'.join([
         'tdr',
         google_project,
-        'snapshot/' + snapshot,
+        snapshot,
         prefix + '/0'
     ])
     return project, source

2 changes: 1 addition & 1 deletion deployments/anvildev/environment.py
@@ -28,7 +28,7 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
     source = None if flags & pop else ':'.join([
         'tdr',
         google_project,
-        'snapshot/' + snapshot,
+        snapshot,
         '/' + str(partition_prefix_length(subgraphs))
     ])
     return project, source

2 changes: 1 addition & 1 deletion deployments/anvilprod/environment.py
@@ -28,7 +28,7 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
     source = None if flags & pop else ':'.join([
         'tdr',
         google_project,
-        'snapshot/' + snapshot,
+        snapshot,
         '/' + str(partition_prefix_length(subgraphs))
     ])
     return project, source

2 changes: 1 addition & 1 deletion deployments/dev/environment.py
@@ -28,7 +28,7 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
     source = None if flags & pop else ':'.join([
         'tdr',
         google_project,
-        'snapshot/' + snapshot,
+        snapshot,
         '/' + str(partition_prefix_length(subgraphs))
     ])
     return project, source

2 changes: 1 addition & 1 deletion deployments/hammerbox/environment.py
@@ -41,7 +41,7 @@ def mksrc(google_project,
     source = None if flags & pop else ':'.join([
         'tdr',
         google_project,
-        'snapshot/' + snapshot,
+        snapshot,
         prefix + '/0'
     ])
     return project, source

2 changes: 1 addition & 1 deletion deployments/prod/environment.py
@@ -30,7 +30,7 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
     source = None if flags & pop else ':'.join([
         'tdr',
         google_project,
-        'snapshot/' + snapshot,
+        snapshot,
         '/' + str(partition_prefix_length(subgraphs))
     ])
     return project, source

2 changes: 1 addition & 1 deletion deployments/sandbox/environment.py
@@ -41,7 +41,7 @@ def mksrc(google_project,
     source = None if flags & pop else ':'.join([
         'tdr',
         google_project,
-        'snapshot/' + snapshot,
+        snapshot,
         prefix + '/0'
     ])
     return project, source

2 changes: 1 addition & 1 deletion deployments/tempdev/environment.py
@@ -28,7 +28,7 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
     source = None if flags & pop else ':'.join([
         'tdr',
         google_project,
-        'snapshot/' + snapshot,
+        snapshot,
         '/' + str(partition_prefix_length(subgraphs))
     ])
     return project, source

11 changes: 4 additions & 7 deletions environment.py
@@ -70,15 +70,13 @@ def env() -> Mapping[str, Optional[str]]:
         #
         # The first catalog listed is the default catalog.
         #
-        # A source represents a TDR dataset, TDR snapshot, or canned staging
-        # area to index. Each source is a string matching the following EBNF
-        # grammar:
+        # A source represents a TDR snapshot or canned staging area to index.
+        # Each source is a string matching the following EBNF grammar:
         #
         # source = TDR source | canned source ;
         #
         # TDR source = 'tdr:', Google Cloud project name,
-        #              ':', ( 'dataset' | 'snapshot' ),
-        #              '/', TDR dataset or snapshot name,
+        #              ':', TDR dataset or snapshot name,
         #              ':', [ prefix ],
         #              '/', partition prefix length ;
         #
@@ -116,8 +114,7 @@ def env() -> Mapping[str, Optional[str]]:
         #
         # Examples:
         #
-        # tdr:broad-jade-dev-data:snapshot/hca_mvp:/1
-        # tdr:broad-jade-dev-data:dataset/hca_mvp:2/1
+        # tdr:broad-jade-dev-data:hca_mvp:/1
         # https://github.com/HumanCellAtlas/schema-test-data/tree/de355ca/tests:2
         #
         # This variable tends to be large. If you get `Argument list too long`

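To make the updated grammar concrete, here is a toy parser for the new TDR source spec format. It is a sketch for illustration only, not Azul's actual ``TDRSourceSpec`` parsing, and it ignores the canned-source alternative:

from typing import NamedTuple

class ParsedTDRSource(NamedTuple):
    project: str  # Google Cloud project name
    name: str  # TDR snapshot name, no longer prefixed with 'snapshot/'
    prefix: str  # optional UUID prefix, may be empty
    partition_prefix_length: int

def parse_tdr_source(spec: str) -> ParsedTDRSource:
    scheme, project, name, partition = spec.split(':')
    assert scheme == 'tdr', spec
    prefix, _, length = partition.partition('/')
    return ParsedTDRSource(project, name, prefix, int(length))

# The example from the updated documentation above:
assert parse_tdr_source('tdr:broad-jade-dev-data:hca_mvp:/1') \
       == ParsedTDRSource('broad-jade-dev-data', 'hca_mvp', '', 1)
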
6 changes: 2 additions & 4 deletions scripts/recan_bundle_tdr.py
@@ -240,7 +240,6 @@ def __init__(self, bundle: TDRHCABundle, file_name: str):
         assert self.concrete_type.endswith('_file')
         self.file_manifest_entry = one(e for e in bundle.manifest
                                        if e['name'] == self.metadata['file_core']['file_name'])
-        assert bundle.fqid.source.spec.is_snapshot
         assert self.file_manifest_entry['drs_path'] is not None
 
     def to_json_row(self) -> JSON:
@@ -368,7 +367,7 @@ def main(argv):
                         help='The UUID of the existing DCP/1 canned bundle.')
     parser.add_argument('--source-id', '-s',
                         default=TestTDRHCAPlugin.source.id,
-                        help='The UUID of the snapshot/dataset to contain the canned DCP/2 bundle.')
+                        help='The UUID of the snapshot to contain the canned DCP/2 bundle.')
     parser.add_argument('--version', '-v',
                         default=TestTDRHCAPlugin.bundle_fqid.version,
                         help='The version for any mock entities synthesized by the script.')
@@ -401,8 +400,7 @@ def main(argv):
     tdr_source = TDRSourceRef(id=args.source_id,
                               spec=TDRSourceSpec(prefix=Prefix.of_everything,
                                                  subdomain='test_project',
-                                                 name='test_name',
-                                                 is_snapshot=True))
+                                                 name='test_name'))
     tdr_bundle = dss_bundle_to_tdr(dss_bundle, tdr_source)
 
     add_supp_files(tdr_bundle,

61 changes: 22 additions & 39 deletions src/azul/plugins/repository/tdr_hca/__init__.py
@@ -5,7 +5,6 @@
     ThreadPoolExecutor,
 )
 from itertools import (
-    groupby,
     islice,
 )
 import json
@@ -39,7 +38,6 @@
 )
 from azul.bigquery import (
     BigQueryRow,
-    BigQueryRows,
     backtick,
 )
 from azul.drs import (
@@ -276,27 +274,21 @@ def _parse_drs_uri(self,
                        file_id: Optional[str],
                        descriptor: JSON
                        ) -> Optional[str]:
-        # The file_id column is present for datasets, but is usually null, may
-        # contain unexpected/unusable values, and NEVER produces usable DRS URLs,
-        # so we avoid parsing the column altogether for datasets.
-        if self.fqid.source.spec.is_snapshot:
-            if file_id is None:
-                try:
-                    external_drs_uri = descriptor['drs_uri']
-                except KeyError:
-                    raise RequirementError('`file_id` is null and `drs_uri` '
-                                           'is not set in file descriptor', descriptor)
-                else:
-                    # FIXME: Support non-null DRS URIs in file descriptors
-                    #        https://github.com/DataBiosphere/azul/issues/3631
-                    if external_drs_uri is not None:
-                        log.warning('Non-null `drs_uri` in file descriptor (%s)', external_drs_uri)
-                        external_drs_uri = None
-                    return external_drs_uri
+        if file_id is None:
+            try:
+                external_drs_uri = descriptor['drs_uri']
+            except KeyError:
+                raise RequirementError('`file_id` is null and `drs_uri` '
+                                       'is not set in file descriptor', descriptor)
             else:
-                return file_id
+                # FIXME: Support non-null DRS URIs in file descriptors
+                #        https://github.com/DataBiosphere/azul/issues/3631
+                if external_drs_uri is not None:
+                    log.warning('Non-null `drs_uri` in file descriptor (%s)', external_drs_uri)
+                    external_drs_uri = None
+                return external_drs_uri
         else:
-            return None
+            return file_id
 
 
 class Plugin(TDRPlugin[TDRHCABundle, TDRSourceSpec, TDRSourceRef, TDRBundleFQID]):
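
The same logic as the simplified ``_parse_drs_uri`` above, restated as a standalone sketch with a plain exception and ``print`` standing in for ``RequirementError`` and the logger; the inputs at the bottom are made up:

from typing import Optional

def parse_drs_uri(file_id: Optional[str], descriptor: dict) -> Optional[str]:
    if file_id is None:
        if 'drs_uri' not in descriptor:
            raise ValueError('`file_id` is null and `drs_uri` is not set '
                             'in file descriptor', descriptor)
        external_drs_uri = descriptor['drs_uri']
        # Non-null external DRS URIs are not supported yet (see the FIXME
        # above); they are dropped with a warning.
        if external_drs_uri is not None:
            print('Warning: dropping non-null drs_uri', external_drs_uri)
            external_drs_uri = None
        return external_drs_uri
    else:
        return file_id

assert parse_drs_uri('v1_file_id_1234', {}) == 'v1_file_id_1234'
assert parse_drs_uri(None, {'drs_uri': None}) is None
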
Expand Down Expand Up @@ -324,7 +316,7 @@ def _list_bundles(self,
) -> list[TDRBundleFQID]:
source_prefix = source.spec.prefix.common
validate_uuid_prefix(source_prefix + prefix)
current_bundles = self._query_latest_version(source.spec, f'''
current_bundles = self._query_unique_sorted(f'''
SELECT links_id, version
FROM {backtick(self._full_table_name(source.spec, 'links'))}
WHERE STARTS_WITH(links_id, '{source_prefix + prefix}')
@@ -336,24 +328,15 @@ def _list_bundles(self,
             for row in current_bundles
         ]
 
-    def _query_latest_version(self,
-                              source: TDRSourceSpec,
-                              query: str,
-                              group_by: str
-                              ) -> list[BigQueryRow]:
+    def _query_unique_sorted(self,
+                             query: str,
+                             group_by: str
+                             ) -> list[BigQueryRow]:
         iter_rows = self._run_sql(query)
         key = itemgetter(group_by)
-        groups = groupby(sorted(iter_rows, key=key), key=key)
-        return [self._choose_one_version(source, group) for _, group in groups]
-
-    def _choose_one_version(self,
-                            source: TDRSourceSpec,
-                            versioned_items: BigQueryRows
-                            ) -> BigQueryRow:
-        if source.is_snapshot:
-            return one(versioned_items)
-        else:
-            return max(versioned_items, key=itemgetter('version'))
+        rows = sorted(iter_rows, key=key)
+        require(len(set(map(key, rows))) == len(rows), 'Expected unique keys', group_by)
+        return rows
 
     def _emulate_bundle(self, bundle_fqid: TDRBundleFQID) -> TDRHCABundle:
         bundle = TDRHCABundle(fqid=bundle_fqid,
@@ -514,7 +497,7 @@ def quote(s):
                 WHERE {self._in(where_columns, where_values)}
             '''
         log.debug('Retrieving %i entities of type %r ...', len(entity_ids), entity_type)
-        rows = self._query_latest_version(source, query, group_by=pk_column)
+        rows = self._query_unique_sorted(query, group_by=pk_column)
         log.debug('Retrieved %i entities of type %r', len(rows), entity_type)
         missing = expected - {row[pk_column] for row in rows}
         require(not missing,

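With dataset support gone, a snapshot is expected to contain exactly one version per entity, which is what the uniqueness requirement in ``_query_unique_sorted`` encodes. A standalone sketch of that behavior, with a plain exception in place of Azul's ``require`` and made-up rows:

from operator import itemgetter

def query_unique_sorted(rows: list[dict], group_by: str) -> list[dict]:
    key = itemgetter(group_by)
    rows = sorted(rows, key=key)
    # Each group-by key must occur exactly once; duplicates would mean
    # multiple versions of the same entity, which snapshots should not have.
    if len(set(map(key, rows))) != len(rows):
        raise RuntimeError('Expected unique keys', group_by)
    return rows

rows = [
    {'links_id': 'b0c18b6f', 'version': '2024-08-01T00:00:00'},
    {'links_id': 'a7f2d9e1', 'version': '2024-08-01T00:00:00'},
]
assert [r['links_id'] for r in query_unique_sorted(rows, 'links_id')] == ['a7f2d9e1', 'b0c18b6f']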