Fix collection glob detection in query-datasets.

CHAINED collections could masquerade as globs with the previous logic. With the new logic, we're probably trying to turn each string into a regex more than one time, but that shouldn't be very expensive.
lsst · Sep 16, 2024 · bdc0e5f · bdc0e5f
1 parent 7d9b803
commit bdc0e5f
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 4 deletions.
diff --git a/doc/changes/DM-46339.bugfix.md b/doc/changes/DM-46339.bugfix.md
@@ -0,0 +1 @@
+Fix a bug in `butler query-datasets` that incorrectly rejected a find-first query against a chain collection as having a glob.
diff --git a/python/lsst/daf/butler/script/queryDatasets.py b/python/lsst/daf/butler/script/queryDatasets.py
@@ -36,6 +36,7 @@
 
 from .._butler import Butler
 from ..cli.utils import sortAstropyTable
+from ..utils import has_globs
 
 if TYPE_CHECKING:
     from lsst.daf.butler import DatasetRef
@@ -261,7 +262,7 @@ def getDatasets(self) -> Iterator[list[DatasetRef]]:
             summary_datasets=dataset_types,
         )
         expanded_query_collections = [c.name for c in query_collections_info]
-        if self._find_first and set(query_collections) != set(expanded_query_collections):
+        if self._find_first and any(has_globs(c) for c in query_collections):
             raise RuntimeError("Can not use wildcards in collections when find_first=True")
         query_collections = expanded_query_collections
 

diff --git a/tests/test_cliCmdQueryDatasets.py b/tests/test_cliCmdQueryDatasets.py
@@ -32,7 +32,7 @@
 import unittest
 
 from astropy.table import Table as AstropyTable
-from lsst.daf.butler import StorageClassFactory, script
+from lsst.daf.butler import CollectionType, StorageClassFactory, script
 from lsst.daf.butler.tests import addDatasetType
 from lsst.daf.butler.tests.utils import ButlerTestHelper, MetricTestRepo, makeTestTempDir, removeTestTempDir
 from lsst.resources import ResourcePath
@@ -370,8 +370,13 @@ def testFindFirstAndCollections(self):
         # Add a new run, and add a dataset to shadow an existing dataset.
         testRepo.addDataset(run="foo", dataId={"instrument": "DummyCamComp", "visit": 424})
 
+        # Add a CHAINED collection to include the two runs, to check that
+        # flattening works as well.
+        testRepo.butler.collections.register("chain", CollectionType.CHAINED)
+        testRepo.butler.collections.redefine_chain("chain", ["foo", "ingest/run"])
+
         # Verify that without find-first, duplicate datasets are returned
-        tables = self._queryDatasets(repo=testRepo.butler, collections=["foo", "ingest/run"], show_uri=True)
+        tables = self._queryDatasets(repo=testRepo.butler, collections=["chain"], show_uri=True)
 
         # The test should be running with a single FileDatastore.
         roots = testRepo.butler.get_datastore_roots()
@@ -514,7 +519,7 @@ def testFindFirstAndCollections(self):
         # Verify that with find first the duplicate dataset is eliminated and
         # the more recent dataset is returned.
         tables = self._queryDatasets(
-            repo=testRepo.butler, collections=["foo", "ingest/run"], show_uri=True, find_first=True
+            repo=testRepo.butler, collections=["chain"], show_uri=True, find_first=True
         )
 
         expectedTables = (
@@ -614,6 +619,10 @@ def testFindFirstAndCollections(self):
 
         self.assertAstropyTablesEqual(tables, expectedTables, filterColumns=True)
 
+        # Verify that globs are not supported with find_first=True.
+        with self.assertRaises(RuntimeError):
+            self._queryDatasets(repo=testRepo.butler, collections=["*"], show_uri=True, find_first=True)
+
 
 if __name__ == "__main__":
     unittest.main()