Skip to content

Commit

Permalink
Fix collection glob detection in query-datasets.
Browse files Browse the repository at this point in the history
CHAINED collections could masquerade as globs with the previous logic.
With the new logic, we're probably trying to turn each string into a
regex more than one time, but that shouldn't be very expensive.
  • Loading branch information
TallJimbo committed Sep 16, 2024
1 parent 7d9b803 commit bdc0e5f
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 4 deletions.
1 change: 1 addition & 0 deletions doc/changes/DM-46339.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix a bug in `butler query-datasets` that incorrectly rejected a find-first query against a chain collection as having a glob.
3 changes: 2 additions & 1 deletion python/lsst/daf/butler/script/queryDatasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

from .._butler import Butler
from ..cli.utils import sortAstropyTable
from ..utils import has_globs

if TYPE_CHECKING:
from lsst.daf.butler import DatasetRef
Expand Down Expand Up @@ -261,7 +262,7 @@ def getDatasets(self) -> Iterator[list[DatasetRef]]:
summary_datasets=dataset_types,
)
expanded_query_collections = [c.name for c in query_collections_info]
if self._find_first and set(query_collections) != set(expanded_query_collections):
if self._find_first and any(has_globs(c) for c in query_collections):
raise RuntimeError("Can not use wildcards in collections when find_first=True")
query_collections = expanded_query_collections

Expand Down
15 changes: 12 additions & 3 deletions tests/test_cliCmdQueryDatasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
import unittest

from astropy.table import Table as AstropyTable
from lsst.daf.butler import StorageClassFactory, script
from lsst.daf.butler import CollectionType, StorageClassFactory, script
from lsst.daf.butler.tests import addDatasetType
from lsst.daf.butler.tests.utils import ButlerTestHelper, MetricTestRepo, makeTestTempDir, removeTestTempDir
from lsst.resources import ResourcePath
Expand Down Expand Up @@ -370,8 +370,13 @@ def testFindFirstAndCollections(self):
# Add a new run, and add a dataset to shadow an existing dataset.
testRepo.addDataset(run="foo", dataId={"instrument": "DummyCamComp", "visit": 424})

# Add a CHAINED collection to include the two runs, to check that
# flattening works as well.
testRepo.butler.collections.register("chain", CollectionType.CHAINED)
testRepo.butler.collections.redefine_chain("chain", ["foo", "ingest/run"])

# Verify that without find-first, duplicate datasets are returned
tables = self._queryDatasets(repo=testRepo.butler, collections=["foo", "ingest/run"], show_uri=True)
tables = self._queryDatasets(repo=testRepo.butler, collections=["chain"], show_uri=True)

# The test should be running with a single FileDatastore.
roots = testRepo.butler.get_datastore_roots()
Expand Down Expand Up @@ -514,7 +519,7 @@ def testFindFirstAndCollections(self):
# Verify that with find first the duplicate dataset is eliminated and
# the more recent dataset is returned.
tables = self._queryDatasets(
repo=testRepo.butler, collections=["foo", "ingest/run"], show_uri=True, find_first=True
repo=testRepo.butler, collections=["chain"], show_uri=True, find_first=True
)

expectedTables = (
Expand Down Expand Up @@ -614,6 +619,10 @@ def testFindFirstAndCollections(self):

self.assertAstropyTablesEqual(tables, expectedTables, filterColumns=True)

# Verify that globs are not supported with find_first=True.
with self.assertRaises(RuntimeError):
self._queryDatasets(repo=testRepo.butler, collections=["*"], show_uri=True, find_first=True)


if __name__ == "__main__":
unittest.main()

0 comments on commit bdc0e5f

Please sign in to comment.