Skip to content

Commit

Permalink
BUG: Fix bug with path search (#1379)
Browse files Browse the repository at this point in the history
* BUG: Fix bug with path search

* FIX: Simplify

* FIX: More

* tweak docstring [ci skip]

---------

Co-authored-by: Daniel McCloy <dan@mccloy.info>
  • Loading branch information
larsoner and drammock authored Feb 25, 2025
1 parent 7b04694 commit 9619b0c
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 53 deletions.
4 changes: 2 additions & 2 deletions doc/whats_new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ Detailed list of changes
- :func:`mne_bids.write_raw_bids()` can now handle mne `Raw` objects with `eyegaze` and `pupil` channels, by `Christian O'Reilly`_ (:gh:`1344`)
- :func:`mne_bids.get_entity_vals()` has a new parameter ``ignore_suffixes`` to easily ignore sidecar files, by `Daniel McCloy`_ (:gh:`1362`)
- Empty-room matching now preferentially finds recordings in the subject directory tagged as `task-noise` before looking in the `sub-emptyroom` directories. This adds support for a part of the BIDS specification for ER recordings, by `Berk Gerçek`_ (:gh:`1364`)
- Path matching is now implemenented in a more efficient manner within `_return_root_paths`, by `Arne Gottwald` (:gh:`1355`)
- Path matching is now implemenented in a more efficient manner within :meth:`mne_bids.BIDSPath.match()` and :func:`mne_bids.find_matching_paths()`, by `Arne Gottwald` (:gh:`1355`)
- :func:`mne_bids.get_entity_vals()` has a new parameter ``include_match`` to prefilter item matching and ignore non-matched items from begin of directory scan, by `Arne Gottwald` (:gh:`1355`)


Expand All @@ -56,7 +56,7 @@ Detailed list of changes
- :func:`mne_bids.read_raw_bids` can optionally return an ``event_id`` dictionary suitable for use with :func:`mne.events_from_annotations`, and if a ``values`` column is present in ``events.tsv`` it will be used as the source of the integer event ID codes, by `Daniel McCloy`_ (:gh:`1349`)
- BIDS dictates that the recording entity should be displayed as "_recording-" in the filename. This PR makes :class:`mne_bids.BIDSPath` correctly display "_recording-" (instead of "_rec-") in BIDSPath.fpath. By `Scott Huberty`_ (:gh:`1348`)
- :func:`mne_bids.make_dataset_description` now correctly encodes the dataset description as UTF-8 on disk, by `Scott Huberty`_ (:gh:`1357`)
- Corrects extension match within `_filter_fnames` at end of filename, by `Arne Gottwald` (:gh:`1355`)
- Corrects extension when filtering filenames in :meth:`mne_bids.BIDSPath.match()` and :func:`mne_bids.find_matching_paths()`, by `Arne Gottwald` (:gh:`1355`)

⚕️ Code health
^^^^^^^^^^^^^^
Expand Down
22 changes: 11 additions & 11 deletions mne_bids/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from copy import deepcopy
from datetime import datetime
from io import StringIO
from itertools import chain as iter_chain
from os import path as op
from pathlib import Path
from textwrap import indent
Expand Down Expand Up @@ -2014,9 +2013,12 @@ def get_entity_vals(
ignore_suffixes : str | array-like of str | None
Suffixes to ignore. If ``None``, include all suffixes. This can be helpful for
ignoring non-data sidecars such as `*_scans.tsv` or `*_coordsystem.json`.
.. versionadded:: 0.17
include_match : str | array-like of str | None
Apply a starting match pragma following Unix style pattern syntax from
package glob to prefilter search criterion.
Glob-style pattern(s) of *directories* to include in the search (i.e., each
must end with ``"/"``). ``None`` (the default) is equivalent to ``"**/"``
(search within any subdirectory of the BIDS root).
.. versionadded:: 0.17
with_key : bool
Expand Down Expand Up @@ -2117,8 +2119,7 @@ def get_entity_vals(
search_str = f"**/*{entity_long_abbr_map[entity_key]}-*_*"
if include_match is not None:
include_match = _ensure_tuple(include_match)
filenames = [root.glob(im + search_str) for im in include_match]
filenames = iter_chain(*filenames)
filenames = [f for im in include_match for f in root.glob(im + search_str)]
else:
filenames = root.glob(search_str)

Expand Down Expand Up @@ -2541,21 +2542,20 @@ def _return_root_paths(root, datatype=None, ignore_json=True, ignore_nosub=False
root = Path(root) # if root is str

if datatype is None and not ignore_nosub:
search_str = "*.*"
paths = root.rglob(search_str)
paths = root.rglob("*.*")
else:
if datatype is not None:
datatype = _ensure_tuple(datatype)
search_str = f"**/{'|'.join(datatype)}/.*"
search_str = f"**/{'|'.join(datatype)}/*.*"
else:
search_str = "**/*.*"

# only browse files which are of the form root/sub-*,
# such that we truely only look in 'sub'-folders:

if ignore_nosub:
search_str = "sub-*/" + search_str

search_str = f"sub-*/{search_str}"
# TODO: Why is this not equivalent to list(root.rglob(search_str)) ?
# Most of the speedup is from using glob.iglob here.
paths = [
Path(root, fn)
for fn in glob.iglob(search_str, root_dir=root, recursive=True)
Expand Down
95 changes: 55 additions & 40 deletions mne_bids/tests/test_path.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from pathlib import Path

import mne
import numpy as np
import pytest
from mne.datasets import testing
from mne.io import anonymize_info
Expand Down Expand Up @@ -172,8 +171,8 @@ def test_path_benchmark(tmp_path_factory):
"""Benchmark exploring bids tree."""
# This benchmark is to verify the speed-up in function call get_entity_vals with
# `include_match=sub-*/` in face of a bids tree hosting derivatives and sourcedata.
n_subjects = 20
n_sessions = 10
n_subjects = 10
n_sessions = 5
n_derivatives = 17
tmp_bids_root = tmp_path_factory.mktemp("mnebids_utils_test_bids_ds")

Expand Down Expand Up @@ -221,50 +220,45 @@ def test_path_benchmark(tmp_path_factory):

# apply nosub on find_matching_matchs with root level bids directory should
# yield a performance boost of order of length from bids_subdirectories.
timed_all, timed_ignored_nosub = (
timeit.timeit(
"mne_bids.find_matching_paths(tmp_bids_root)",
setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'",
number=1,
),
timeit.timeit(
"mne_bids.find_matching_paths(tmp_bids_root, ignore_nosub=True)",
setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'",
number=10,
)
/ 10,
setup = "import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'"
timed_all = timeit.timeit(
"mne_bids.find_matching_paths(tmp_bids_root)", setup=setup, number=1
)

assert (
timed_all / (10 * np.maximum(1, np.floor(len(bids_subdirectories) / 10)))
# while this should be of same order, lets give it some space by a factor of 2
> 0.5 * timed_ignored_nosub
timed_ignored_nosub = timeit.timeit(
"mne_bids.find_matching_paths(tmp_bids_root, ignore_nosub=True)",
setup=setup,
number=1,
)

# while this should be of same order, lets give it some space by a factor of 2
target = 2 * timed_all / len(bids_subdirectories)
assert timed_ignored_nosub < target

# apply include_match on get_entity_vals with root level bids directory should
# yield a performance boost of order of length from bids_subdirectories.
timed_entity, timed_entity_match = (
timeit.timeit(
"mne_bids.get_entity_vals(tmp_bids_root, 'session')",
setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'",
number=1,
),
timeit.timeit(
"mne_bids.get_entity_vals(tmp_bids_root, 'session', include_match='sub-*/')", # noqa: E501
setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'",
number=10,
)
/ 10,
timed_entity = timeit.timeit(
"mne_bids.get_entity_vals(tmp_bids_root, 'session')",
setup=setup,
number=1,
)

assert (
timed_entity / (10 * np.maximum(1, np.floor(len(bids_subdirectories) / 10)))
# while this should be of same order, lets give it some space by a factor of 2
> 0.5 * timed_entity_match
timed_entity_match = timeit.timeit(
"mne_bids.get_entity_vals(tmp_bids_root, 'session', include_match='sub-*/')", # noqa: E501
setup=setup,
number=1,
)

# Clean up
shutil.rmtree(tmp_bids_root)
# while this should be of same order, lets give it some space by a factor of 2
target = 2 * timed_entity / len(bids_subdirectories)
assert timed_entity_match < target

# and these should be equivalent
out_1 = get_entity_vals(tmp_bids_root, "session")
out_2 = get_entity_vals(tmp_bids_root, "session", include_match="**/")
assert out_1 == out_2
out_3 = get_entity_vals(tmp_bids_root, "session", include_match="sub-*/")
assert out_2 == out_3 # all are sub-* vals
out_4 = get_entity_vals(tmp_bids_root, "session", include_match="none/")
assert out_4 == []


def test_search_folder_for_text(capsys):
Expand Down Expand Up @@ -1046,7 +1040,7 @@ def test_filter_fnames(entities, expected_n_matches):


@testing.requires_testing_data
def test_match(return_bids_test_dir):
def test_match_basic(return_bids_test_dir):
"""Test retrieval of matching basenames."""
bids_root = Path(return_bids_test_dir)

Expand Down Expand Up @@ -1140,6 +1134,27 @@ def test_match(return_bids_test_dir):
bids_path_01.fpath.unlink() # clean up created file


def test_match_advanced(tmp_path):
"""Test additional match functionality."""
bids_root = tmp_path
fnames = (
"sub-01/nirs/sub-01_task-tapping_events.tsv",
"sub-02/nirs/sub-02_task-tapping_events.tsv",
)
for fname in fnames:
this_path = Path(bids_root / fname)
this_path.parent.mkdir(parents=True, exist_ok=True)
this_path.touch()
path = BIDSPath(
root=bids_root,
datatype="nirs",
suffix="events",
extension=".tsv",
)
matches = path.match()
assert len(matches) == len(fnames), path


@testing.requires_testing_data
def test_find_matching_paths(return_bids_test_dir):
"""We test by yielding the same results as BIDSPath.match().
Expand Down

0 comments on commit 9619b0c

Please sign in to comment.