[u r] Remove support for indexing TDR datasets (#6426)
nadove-ucsc committed Aug 1, 2024
1 parent 94525aa commit b2705b5
Showing 20 changed files with 98 additions and 156 deletions.
2 changes: 1 addition & 1 deletion OPERATOR.rst
@@ -278,7 +278,7 @@ To specify a catalog to be reindexed, set ``Key`` to ``azul_current_catalog``
 and ``Value`` to the name of the catalog, for example, ``dcp3``. To specify the
 sources to be reindexed, set ``Key`` to ``azul_current_sources`` and
 ``Value`` to a space-separated list of sources globs, e.g.
-``*:snapshot/hca_dev_* *:snapshot/lungmap_dev_*``. Check the inputs you just
+``*:hca_dev_* *:lungmap_dev_*``. Check the inputs you just
 made. Start the ``reindex`` job by clicking on ``Run job``. Wait until the job
 has completed.

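With the ``snapshot/`` segment gone, reindex source globs match directly against the snapshot name. A small illustrative check, assuming glob semantics comparable to ``fnmatch`` as a stand-in for however Azul actually filters sources; the spec string below is hypothetical:

from fnmatch import fnmatch

# Hypothetical TDR source spec for a dev snapshot; real specs follow the
# same 'tdr:<project>:<snapshot>:<prefix>/<length>' shape.
spec = 'tdr:datarepo-dev-a9252919:hca_dev_20201023_panc:2/1'

# The new-style glob from the instructions above matches the spec directly ...
assert fnmatch(spec, '*:hca_dev_*')
# ... while the old-style glob containing 'snapshot/' no longer does.
assert not fnmatch(spec, '*:snapshot/hca_dev_*')
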
10 changes: 10 additions & 0 deletions UPGRADING.rst
@@ -20,6 +20,16 @@ reverted. This is all fairly informal and loosely defined. Hopefully we won't
 have too many entries in this file.
 
 
+#6426 Clean-up and generalize TDR source specs
+==============================================
+
+The "snapshot/" string has been removed from TDR source specs.
+
+Update the ``mksrc`` function in ``environment.py`` for each of your personal
+deployments. As always, use the sandbox deployment's ``environment.py`` as a
+model when upgrading personal deployments.
+
+
 #6381 Update Terraform to 1.9.x
 ===============================
 

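For personal deployments, the change boils down to dropping the ``'snapshot/' +`` prefix inside ``mksrc``. A minimal sketch, assuming a ``mksrc`` shaped like the sandbox variant further down in this diff; the ``pop`` flag and the way ``project`` is derived are placeholders, not the real definitions:

pop = 1  # placeholder for the flag bit that excludes a source from the catalog

def mksrc(google_project: str,
          snapshot: str,
          subgraphs: int,
          prefix: str = '',
          flags: int = 0) -> tuple[str, str | None]:
    project = snapshot  # placeholder; real deployments derive the project name
    source = None if flags & pop else ':'.join([
        'tdr',
        google_project,
        snapshot,  # previously: 'snapshot/' + snapshot
        prefix + '/0'
    ])
    return project, source
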
2 changes: 1 addition & 1 deletion deployments/anvilbox/environment.py
@@ -41,7 +41,7 @@ def mksrc(google_project,
     source = None if flags & pop else ':'.join([
         'tdr',
         google_project,
-        'snapshot/' + snapshot,
+        snapshot,
         prefix + '/0'
     ])
     return project, source

2 changes: 1 addition & 1 deletion deployments/anvildev/environment.py
@@ -28,7 +28,7 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
     source = None if flags & pop else ':'.join([
         'tdr',
         google_project,
-        'snapshot/' + snapshot,
+        snapshot,
         '/' + str(partition_prefix_length(subgraphs))
     ])
     return project, source

2 changes: 1 addition & 1 deletion deployments/anvilprod/environment.py
@@ -28,7 +28,7 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
     source = None if flags & pop else ':'.join([
         'tdr',
         google_project,
-        'snapshot/' + snapshot,
+        snapshot,
         '/' + str(partition_prefix_length(subgraphs))
     ])
     return project, source

2 changes: 1 addition & 1 deletion deployments/dev/environment.py
@@ -28,7 +28,7 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
     source = None if flags & pop else ':'.join([
         'tdr',
         google_project,
-        'snapshot/' + snapshot,
+        snapshot,
         '/' + str(partition_prefix_length(subgraphs))
     ])
     return project, source

2 changes: 1 addition & 1 deletion deployments/hammerbox/environment.py
@@ -41,7 +41,7 @@ def mksrc(google_project,
     source = None if flags & pop else ':'.join([
         'tdr',
         google_project,
-        'snapshot/' + snapshot,
+        snapshot,
         prefix + '/0'
     ])
     return project, source

2 changes: 1 addition & 1 deletion deployments/prod/environment.py
@@ -30,7 +30,7 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
     source = None if flags & pop else ':'.join([
         'tdr',
         google_project,
-        'snapshot/' + snapshot,
+        snapshot,
         '/' + str(partition_prefix_length(subgraphs))
     ])
     return project, source

2 changes: 1 addition & 1 deletion deployments/sandbox/environment.py
@@ -41,7 +41,7 @@ def mksrc(google_project,
     source = None if flags & pop else ':'.join([
         'tdr',
         google_project,
-        'snapshot/' + snapshot,
+        snapshot,
         prefix + '/0'
     ])
     return project, source

2 changes: 1 addition & 1 deletion deployments/tempdev/environment.py
@@ -28,7 +28,7 @@ def mksrc(google_project, snapshot, subgraphs, flags: int = 0) -> tuple[str, str
     source = None if flags & pop else ':'.join([
         'tdr',
         google_project,
-        'snapshot/' + snapshot,
+        snapshot,
         '/' + str(partition_prefix_length(subgraphs))
     ])
     return project, source

11 changes: 4 additions & 7 deletions environment.py
@@ -70,15 +70,13 @@ def env() -> Mapping[str, Optional[str]]:
         #
         # The first catalog listed is the default catalog.
         #
-        # A source represents a TDR dataset, TDR snapshot, or canned staging
-        # area to index. Each source is a string matching the following EBNF
-        # grammar:
+        # A source represents a TDR snapshot or canned staging area to index.
+        # Each source is a string matching the following EBNF grammar:
         #
         # source = TDR source | canned source ;
         #
         # TDR source = 'tdr:', Google Cloud project name,
-        #              ':', ( 'dataset' | 'snapshot' ),
-        #              '/', TDR dataset or snapshot name,
+        #              ':', TDR dataset or snapshot name,
         #              ':', [ prefix ],
         #              '/', partition prefix length ;
         #
@@ -116,8 +114,7 @@ def env() -> Mapping[str, Optional[str]]:
         #
         # Examples:
         #
-        # tdr:broad-jade-dev-data:snapshot/hca_mvp:/1
-        # tdr:broad-jade-dev-data:dataset/hca_mvp:2/1
+        # tdr:broad-jade-dev-data:hca_mvp:/1
         # https://github.com/HumanCellAtlas/schema-test-data/tree/de355ca/tests:2
         #
         # This variable tends to be large. If you get `Argument list too long`

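To make the updated grammar concrete, here is a toy parser for the new TDR source spec format. It is a sketch for illustration only, not Azul's actual ``TDRSourceSpec`` parsing, and it ignores the canned-source alternative:

from typing import NamedTuple

class ParsedTDRSource(NamedTuple):
    project: str  # Google Cloud project name
    name: str  # TDR snapshot name, no longer prefixed with 'snapshot/'
    prefix: str  # optional UUID prefix, may be empty
    partition_prefix_length: int

def parse_tdr_source(spec: str) -> ParsedTDRSource:
    scheme, project, name, partition = spec.split(':')
    assert scheme == 'tdr', spec
    prefix, _, length = partition.partition('/')
    return ParsedTDRSource(project, name, prefix, int(length))

# The example from the updated documentation above:
assert parse_tdr_source('tdr:broad-jade-dev-data:hca_mvp:/1') \
       == ParsedTDRSource('broad-jade-dev-data', 'hca_mvp', '', 1)
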
6 changes: 2 additions & 4 deletions scripts/recan_bundle_tdr.py
@@ -240,7 +240,6 @@ def __init__(self, bundle: TDRHCABundle, file_name: str):
         assert self.concrete_type.endswith('_file')
         self.file_manifest_entry = one(e for e in bundle.manifest
                                        if e['name'] == self.metadata['file_core']['file_name'])
-        assert bundle.fqid.source.spec.is_snapshot
         assert self.file_manifest_entry['drs_path'] is not None
 
     def to_json_row(self) -> JSON:
@@ -368,7 +367,7 @@ def main(argv):
                         help='The UUID of the existing DCP/1 canned bundle.')
     parser.add_argument('--source-id', '-s',
                         default=TestTDRHCAPlugin.source.id,
-                        help='The UUID of the snapshot/dataset to contain the canned DCP/2 bundle.')
+                        help='The UUID of the snapshot to contain the canned DCP/2 bundle.')
     parser.add_argument('--version', '-v',
                         default=TestTDRHCAPlugin.bundle_fqid.version,
                         help='The version for any mock entities synthesized by the script.')
@@ -401,8 +400,7 @@ def main(argv):
     tdr_source = TDRSourceRef(id=args.source_id,
                               spec=TDRSourceSpec(prefix=Prefix.of_everything,
                                                  subdomain='test_project',
-                                                 name='test_name',
-                                                 is_snapshot=True))
+                                                 name='test_name'))
     tdr_bundle = dss_bundle_to_tdr(dss_bundle, tdr_source)
 
     add_supp_files(tdr_bundle,

61 changes: 22 additions & 39 deletions src/azul/plugins/repository/tdr_hca/__init__.py
@@ -5,7 +5,6 @@
     ThreadPoolExecutor,
 )
 from itertools import (
-    groupby,
     islice,
 )
 import json
@@ -39,7 +38,6 @@
 )
 from azul.bigquery import (
     BigQueryRow,
-    BigQueryRows,
     backtick,
 )
 from azul.drs import (
@@ -276,27 +274,21 @@ def _parse_drs_uri(self,
                        file_id: Optional[str],
                        descriptor: JSON
                        ) -> Optional[str]:
-        # The file_id column is present for datasets, but is usually null, may
-        # contain unexpected/unusable values, and NEVER produces usable DRS URLs,
-        # so we avoid parsing the column altogether for datasets.
-        if self.fqid.source.spec.is_snapshot:
-            if file_id is None:
-                try:
-                    external_drs_uri = descriptor['drs_uri']
-                except KeyError:
-                    raise RequirementError('`file_id` is null and `drs_uri` '
-                                           'is not set in file descriptor', descriptor)
-                else:
-                    # FIXME: Support non-null DRS URIs in file descriptors
-                    #        https://github.com/DataBiosphere/azul/issues/3631
-                    if external_drs_uri is not None:
-                        log.warning('Non-null `drs_uri` in file descriptor (%s)', external_drs_uri)
-                        external_drs_uri = None
-                    return external_drs_uri
+        if file_id is None:
+            try:
+                external_drs_uri = descriptor['drs_uri']
+            except KeyError:
+                raise RequirementError('`file_id` is null and `drs_uri` '
+                                       'is not set in file descriptor', descriptor)
             else:
-                return file_id
+                # FIXME: Support non-null DRS URIs in file descriptors
+                #        https://github.com/DataBiosphere/azul/issues/3631
+                if external_drs_uri is not None:
+                    log.warning('Non-null `drs_uri` in file descriptor (%s)', external_drs_uri)
+                    external_drs_uri = None
+                return external_drs_uri
         else:
-            return None
+            return file_id
 
 
 class Plugin(TDRPlugin[TDRHCABundle, TDRSourceSpec, TDRSourceRef, TDRBundleFQID]):
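
The same logic as the simplified ``_parse_drs_uri`` above, restated as a standalone sketch with a plain exception and ``print`` standing in for ``RequirementError`` and the logger; the inputs at the bottom are made up:

from typing import Optional

def parse_drs_uri(file_id: Optional[str], descriptor: dict) -> Optional[str]:
    if file_id is None:
        if 'drs_uri' not in descriptor:
            raise ValueError('`file_id` is null and `drs_uri` is not set '
                             'in file descriptor', descriptor)
        external_drs_uri = descriptor['drs_uri']
        # Non-null external DRS URIs are not supported yet (see the FIXME
        # above); they are dropped with a warning.
        if external_drs_uri is not None:
            print('Warning: dropping non-null drs_uri', external_drs_uri)
            external_drs_uri = None
        return external_drs_uri
    else:
        return file_id

assert parse_drs_uri('v1_file_id_1234', {}) == 'v1_file_id_1234'
assert parse_drs_uri(None, {'drs_uri': None}) is None
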
Expand Down Expand Up @@ -324,7 +316,7 @@ def _list_bundles(self,
) -> list[TDRBundleFQID]:
source_prefix = source.spec.prefix.common
validate_uuid_prefix(source_prefix + prefix)
current_bundles = self._query_latest_version(source.spec, f'''
current_bundles = self._query_unique_sorted(f'''
SELECT links_id, version
FROM {backtick(self._full_table_name(source.spec, 'links'))}
WHERE STARTS_WITH(links_id, '{source_prefix + prefix}')
@@ -336,24 +328,15 @@ def _list_bundles(self,
             for row in current_bundles
         ]
 
-    def _query_latest_version(self,
-                              source: TDRSourceSpec,
-                              query: str,
-                              group_by: str
-                              ) -> list[BigQueryRow]:
+    def _query_unique_sorted(self,
+                             query: str,
+                             group_by: str
+                             ) -> list[BigQueryRow]:
         iter_rows = self._run_sql(query)
         key = itemgetter(group_by)
-        groups = groupby(sorted(iter_rows, key=key), key=key)
-        return [self._choose_one_version(source, group) for _, group in groups]
-
-    def _choose_one_version(self,
-                            source: TDRSourceSpec,
-                            versioned_items: BigQueryRows
-                            ) -> BigQueryRow:
-        if source.is_snapshot:
-            return one(versioned_items)
-        else:
-            return max(versioned_items, key=itemgetter('version'))
+        rows = sorted(iter_rows, key=key)
+        require(len(set(map(key, rows))) == len(rows), 'Expected unique keys', group_by)
+        return rows
 
     def _emulate_bundle(self, bundle_fqid: TDRBundleFQID) -> TDRHCABundle:
         bundle = TDRHCABundle(fqid=bundle_fqid,
@@ -514,7 +497,7 @@ def quote(s):
                 WHERE {self._in(where_columns, where_values)}
             '''
         log.debug('Retrieving %i entities of type %r ...', len(entity_ids), entity_type)
-        rows = self._query_latest_version(source, query, group_by=pk_column)
+        rows = self._query_unique_sorted(query, group_by=pk_column)
         log.debug('Retrieved %i entities of type %r', len(rows), entity_type)
         missing = expected - {row[pk_column] for row in rows}
         require(not missing,

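With dataset support gone, a snapshot is expected to contain exactly one version per entity, which is what the uniqueness requirement in ``_query_unique_sorted`` encodes. A standalone sketch of that behavior, with a plain exception in place of Azul's ``require`` and made-up rows:

from operator import itemgetter

def query_unique_sorted(rows: list[dict], group_by: str) -> list[dict]:
    key = itemgetter(group_by)
    rows = sorted(rows, key=key)
    # Each group-by key must occur exactly once; duplicates would mean
    # multiple versions of the same entity, which snapshots should not have.
    if len(set(map(key, rows))) != len(rows):
        raise RuntimeError('Expected unique keys', group_by)
    return rows

rows = [
    {'links_id': 'b0c18b6f', 'version': '2024-08-01T00:00:00'},
    {'links_id': 'a7f2d9e1', 'version': '2024-08-01T00:00:00'},
]
assert [r['links_id'] for r in query_unique_sorted(rows, 'links_id')] == ['a7f2d9e1', 'b0c18b6f']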