Skip to content

Commit

Permalink
Merge pull request #1073 from lsst/tickets/DM-45860-hotfix
Browse files Browse the repository at this point in the history
DM-45860: Do not do the collections query again unless we have to
  • Loading branch information
timj authored Sep 9, 2024
2 parents aa622dc + 8d55d2f commit cca901d
Show file tree
Hide file tree
Showing 6 changed files with 86 additions and 18 deletions.
15 changes: 8 additions & 7 deletions python/lsst/daf/butler/_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
from .dimensions import DataCoordinate, DimensionConfig
from .registry import RegistryConfig, _RegistryFactory
from .repo_relocation import BUTLER_ROOT_TAG
from .utils import has_globs

if TYPE_CHECKING:
from ._dataset_existence import DatasetExistence
Expand Down Expand Up @@ -1688,17 +1689,17 @@ def query_datasets(
data_id = DataCoordinate.make_empty(self.dimensions)
if order_by is None:
order_by = []
if collections:
if collections and has_globs(collections):
# Wild cards need to be expanded but can only be allowed if
# find_first=False because expanding wildcards does not return
# a guaranteed ordering.
expanded_collections = self.collections.query(collections)
if find_first and set(expanded_collections) != set(ensure_iterable(collections)):
# a guaranteed ordering. Querying collection registry to expand
# collections when we do not have wildcards is expensive so only
# do it if we need it.
if find_first:
raise RuntimeError(
"Can not use wildcards in collections when find_first=True "
f" (given {collections} which expanded to {expanded_collections})"
f"Can not use wildcards in collections when find_first=True (given {collections})"
)
collections = expanded_collections
collections = self.collections.query(collections)
query_limit = limit
warn_limit = False
if limit is not None and limit < 0:
Expand Down
9 changes: 7 additions & 2 deletions python/lsst/daf/butler/cli/opt/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,11 +288,16 @@ def _config_split(*args: Any) -> dict[str | None, str]:
)


_default_limit = -20_000
limit_option = MWOptionDecorator(
"--limit",
help=unwrap("Limit the number of records, by default all records are shown."),
help=unwrap(
f"""Limit the number of results that are processed. 0 means no limit. A negative
value specifies a cap where a warning will be issued if the cap is hit.
Default value is {_default_limit}."""
),
type=int,
default=0,
default=_default_limit,
)

offset_option = MWOptionDecorator(
Expand Down
21 changes: 19 additions & 2 deletions python/lsst/daf/butler/script/queryDataIds.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ def __init__(self, dataIds: Iterable[DataCoordinate]):
# use dict to store dataIds as keys to preserve ordering
self.dataIds = dict.fromkeys(dataIds)

def __len__(self) -> int:
return len(self.dataIds)

def pop_last(self) -> None:
if self.dataIds:
final_key = list(self.dataIds.keys())[-1]
self.dataIds.pop(final_key)

def getAstropyTable(self, order: bool) -> AstropyTable:
"""Get the table as an astropy table.
Expand Down Expand Up @@ -195,12 +203,21 @@ def queryDataIds(
results = results.where(where)
if order_by:
results = results.order_by(*order_by)
if limit > 0:
results = results.limit(limit)
query_limit = abs(limit)
warn_limit = False
if limit != 0:
if limit < 0:
query_limit += 1
warn_limit = True

results = results.limit(query_limit)

if results.any(exact=False):
if results.dimensions:
table = _Table(results)
if warn_limit and len(table) == query_limit:
table.pop_last()
_LOG.warning("More data IDs are available than the request limit of %d", abs(limit))
if not table.dataIds:
return None, "Post-query region filtering removed all rows, since nothing overlapped."
return table.getAstropyTable(not order_by), None
Expand Down
16 changes: 14 additions & 2 deletions python/lsst/daf/butler/script/queryDimensionRecords.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

from __future__ import annotations

import logging
from operator import attrgetter
from typing import Any

Expand All @@ -36,6 +37,8 @@
from .._butler import Butler
from .._timespan import Timespan

_LOG = logging.getLogger(__name__)


def queryDimensionRecords(
repo: str,
Expand Down Expand Up @@ -100,10 +103,19 @@ def queryDimensionRecords(
query_results = query_results.where(where)
if order_by:
query_results = query_results.order_by(*order_by)
if limit > 0:
query_results = query_results.limit(limit)
query_limit = abs(limit)
warn_limit = False
if limit != 0:
if limit < 0:
query_limit += 1
warn_limit = True

query_results = query_results.limit(query_limit)

records = list(query_results)
if warn_limit and len(records) == query_limit:
records.pop(-1)
_LOG.warning("More data IDs are available than the request limit of %d", abs(limit))

if not records:
return None
Expand Down
29 changes: 26 additions & 3 deletions python/lsst/daf/butler/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
import functools
import logging
import re
from collections.abc import Callable
from collections.abc import Callable, Iterable
from re import Pattern
from types import EllipsisType
from typing import Any, TypeVar
Expand Down Expand Up @@ -92,7 +92,7 @@ def stripIfNotNone(s: str | None) -> str | None:
return s


def globToRegex(expressions: str | EllipsisType | None | list[str]) -> list[str | Pattern] | EllipsisType:
def globToRegex(expressions: str | EllipsisType | None | Iterable[str]) -> list[str | Pattern] | EllipsisType:
"""Translate glob-style search terms to regex.
If a stand-alone '``*``' is found in ``expressions``, or expressions is
Expand All @@ -101,7 +101,7 @@ def globToRegex(expressions: str | EllipsisType | None | list[str]) -> list[str
Parameters
----------
expressions : `str` or `list` [`str`]
expressions : `str` or `~collections.abc.Iterable` [`str`]
A list of glob-style pattern strings to convert.
Returns
Expand Down Expand Up @@ -134,6 +134,29 @@ def globToRegex(expressions: str | EllipsisType | None | list[str]) -> list[str
return results


def has_globs(expressions: str | EllipsisType | None | Iterable[str]) -> bool:
"""Determine if the expressions have any glob wildcard characters.
Parameters
----------
expressions : `str` or `~collections.abc.Iterable` [`str`]
A list of glob-style pattern strings to check.
Returns
-------
has_globs : `bool`
`True` if any of the supplied strings contain glob patterns.
"""
expanded = globToRegex(expressions)
if expanded is ...:
return True

for result in expanded:
if not isinstance(result, str):
return True
return False


class _Marker:
"""Private class to use as a default value when you want to know that
a default parameter has been over-ridden.
Expand Down
14 changes: 12 additions & 2 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from collections import namedtuple

from lsst.daf.butler import NamedKeyDict, NamedValueSet
from lsst.daf.butler.utils import globToRegex
from lsst.daf.butler.utils import globToRegex, has_globs

TESTDIR = os.path.dirname(__file__)

Expand Down Expand Up @@ -187,7 +187,9 @@ def testStarInList(self):
"""Test that if a one of the items in the expression list is a star
(stand-alone) then ``...`` is returned (which implies no restrictions)
"""
self.assertIs(globToRegex(["foo", "*", "bar"]), ...)
globs = ["foo", "*", "bar"]
self.assertIs(globToRegex(globs), ...)
self.assertTrue(has_globs(globs))

def testGlobList(self):
"""Test that a list of glob strings converts as expected to a regex and
Expand All @@ -196,6 +198,8 @@ def testGlobList(self):
# These strings should be returned unchanged.
strings = ["bar", "😺", "ingésτ", "ex]", "[xe", "[!no", "e[x"]
self.assertEqual(globToRegex(strings), strings)
self.assertFalse(has_globs(strings))
self.assertTrue(has_globs("ba*"))

# Globs with strings that match the glob and strings that do not.
tests = (
Expand All @@ -215,6 +219,12 @@ def testGlobList(self):
for no_match in no_matches:
self.assertIsNone(re.fullmatch(patterns[0], no_match))

# All of them are globs except "bar".
if glob == "bar":
self.assertFalse(has_globs(glob))
else:
self.assertTrue(has_globs(glob))


if __name__ == "__main__":
unittest.main()

0 comments on commit cca901d

Please sign in to comment.