BUG: Fix bug with path search (#1379)

* BUG: Fix bug with path search * FIX: Simplify * FIX: More * tweak docstring [ci skip] --------- Co-authored-by: Daniel McCloy <dan@mccloy.info>
mne-tools · Feb 25, 2025 · 9619b0c · 9619b0c
1 parent 7b04694
commit 9619b0c
Show file tree

Hide file tree

Showing 3 changed files with 68 additions and 53 deletions.
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -36,7 +36,7 @@ Detailed list of changes
 - :func:`mne_bids.write_raw_bids()` can now handle mne `Raw` objects with `eyegaze` and `pupil` channels, by `Christian O'Reilly`_ (:gh:`1344`)
 - :func:`mne_bids.get_entity_vals()` has a new parameter ``ignore_suffixes`` to easily ignore sidecar files, by `Daniel McCloy`_ (:gh:`1362`)
 - Empty-room matching now preferentially finds recordings in the subject directory tagged as `task-noise` before looking in the `sub-emptyroom` directories. This adds support for a part of the BIDS specification for ER recordings, by `Berk Gerçek`_ (:gh:`1364`)
-- Path matching is now implemenented in a more efficient manner within `_return_root_paths`, by `Arne Gottwald` (:gh:`1355`)
+- Path matching is now implemenented in a more efficient manner within :meth:`mne_bids.BIDSPath.match()` and :func:`mne_bids.find_matching_paths()`, by `Arne Gottwald` (:gh:`1355`)
 - :func:`mne_bids.get_entity_vals()` has a new parameter ``include_match`` to prefilter item matching and ignore non-matched items from begin of directory scan, by `Arne Gottwald` (:gh:`1355`)
 
 
@@ -56,7 +56,7 @@ Detailed list of changes
 - :func:`mne_bids.read_raw_bids` can optionally return an ``event_id`` dictionary suitable for use with :func:`mne.events_from_annotations`, and if a ``values`` column is present in ``events.tsv`` it will be used as the source of the integer event ID codes, by `Daniel McCloy`_ (:gh:`1349`)
 - BIDS dictates that the recording entity should be displayed as "_recording-" in the filename. This PR makes :class:`mne_bids.BIDSPath`  correctly display "_recording-" (instead of "_rec-") in BIDSPath.fpath. By `Scott Huberty`_ (:gh:`1348`)
 - :func:`mne_bids.make_dataset_description` now correctly encodes the dataset description as UTF-8 on disk, by `Scott Huberty`_ (:gh:`1357`)
-- Corrects extension match within `_filter_fnames` at end of filename, by `Arne Gottwald` (:gh:`1355`)
+- Corrects extension when filtering filenames in :meth:`mne_bids.BIDSPath.match()` and :func:`mne_bids.find_matching_paths()`, by `Arne Gottwald` (:gh:`1355`)
 
 ⚕️ Code health
 ^^^^^^^^^^^^^^

diff --git a/mne_bids/path.py b/mne_bids/path.py
@@ -12,7 +12,6 @@
 from copy import deepcopy
 from datetime import datetime
 from io import StringIO
-from itertools import chain as iter_chain
 from os import path as op
 from pathlib import Path
 from textwrap import indent
@@ -2014,9 +2013,12 @@ def get_entity_vals(
     ignore_suffixes : str | array-like of str | None
         Suffixes to ignore. If ``None``, include all suffixes. This can be helpful for
         ignoring non-data sidecars such as `*_scans.tsv` or `*_coordsystem.json`.
+
+        .. versionadded:: 0.17
     include_match : str | array-like of str | None
-        Apply a starting match pragma following Unix style pattern syntax from
-        package glob to prefilter search criterion.
+        Glob-style pattern(s) of *directories* to include in the search (i.e., each
+        must end with ``"/"``). ``None`` (the default) is equivalent to ``"**/"``
+        (search within any subdirectory of the BIDS root).
 
         .. versionadded:: 0.17
     with_key : bool
@@ -2117,8 +2119,7 @@ def get_entity_vals(
     search_str = f"**/*{entity_long_abbr_map[entity_key]}-*_*"
     if include_match is not None:
         include_match = _ensure_tuple(include_match)
-        filenames = [root.glob(im + search_str) for im in include_match]
-        filenames = iter_chain(*filenames)
+        filenames = [f for im in include_match for f in root.glob(im + search_str)]
     else:
         filenames = root.glob(search_str)
 
@@ -2541,21 +2542,20 @@ def _return_root_paths(root, datatype=None, ignore_json=True, ignore_nosub=False
     root = Path(root)  # if root is str
 
     if datatype is None and not ignore_nosub:
-        search_str = "*.*"
-        paths = root.rglob(search_str)
+        paths = root.rglob("*.*")
     else:
         if datatype is not None:
             datatype = _ensure_tuple(datatype)
-            search_str = f"**/{'|'.join(datatype)}/.*"
+            search_str = f"**/{'|'.join(datatype)}/*.*"
         else:
             search_str = "**/*.*"
 
         # only browse files which are of the form root/sub-*,
         # such that we truely only look in 'sub'-folders:
-
         if ignore_nosub:
-            search_str = "sub-*/" + search_str
-
+            search_str = f"sub-*/{search_str}"
+        # TODO: Why is this not equivalent to list(root.rglob(search_str)) ?
+        # Most of the speedup is from using glob.iglob here.
         paths = [
             Path(root, fn)
             for fn in glob.iglob(search_str, root_dir=root, recursive=True)

diff --git a/mne_bids/tests/test_path.py b/mne_bids/tests/test_path.py
@@ -12,7 +12,6 @@
 from pathlib import Path
 
 import mne
-import numpy as np
 import pytest
 from mne.datasets import testing
 from mne.io import anonymize_info
@@ -172,8 +171,8 @@ def test_path_benchmark(tmp_path_factory):
     """Benchmark exploring bids tree."""
     # This benchmark is to verify the speed-up in function call get_entity_vals with
     # `include_match=sub-*/` in face of a bids tree hosting derivatives and sourcedata.
-    n_subjects = 20
-    n_sessions = 10
+    n_subjects = 10
+    n_sessions = 5
     n_derivatives = 17
     tmp_bids_root = tmp_path_factory.mktemp("mnebids_utils_test_bids_ds")
 
@@ -221,50 +220,45 @@ def test_path_benchmark(tmp_path_factory):
 
     # apply nosub on find_matching_matchs with root level bids directory should
     # yield a performance boost of order of length from bids_subdirectories.
-    timed_all, timed_ignored_nosub = (
-        timeit.timeit(
-            "mne_bids.find_matching_paths(tmp_bids_root)",
-            setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'",
-            number=1,
-        ),
-        timeit.timeit(
-            "mne_bids.find_matching_paths(tmp_bids_root, ignore_nosub=True)",
-            setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'",
-            number=10,
-        )
-        / 10,
+    setup = "import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'"
+    timed_all = timeit.timeit(
+        "mne_bids.find_matching_paths(tmp_bids_root)", setup=setup, number=1
     )
-
-    assert (
-        timed_all / (10 * np.maximum(1, np.floor(len(bids_subdirectories) / 10)))
-        # while this should be of same order, lets give it some space by a factor of 2
-        > 0.5 * timed_ignored_nosub
+    timed_ignored_nosub = timeit.timeit(
+        "mne_bids.find_matching_paths(tmp_bids_root, ignore_nosub=True)",
+        setup=setup,
+        number=1,
     )
 
+    # while this should be of same order, lets give it some space by a factor of 2
+    target = 2 * timed_all / len(bids_subdirectories)
+    assert timed_ignored_nosub < target
+
     # apply include_match on get_entity_vals with root level bids directory should
     # yield a performance boost of order of length from bids_subdirectories.
-    timed_entity, timed_entity_match = (
-        timeit.timeit(
-            "mne_bids.get_entity_vals(tmp_bids_root, 'session')",
-            setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'",
-            number=1,
-        ),
-        timeit.timeit(
-            "mne_bids.get_entity_vals(tmp_bids_root, 'session', include_match='sub-*/')",  # noqa: E501
-            setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'",
-            number=10,
-        )
-        / 10,
+    timed_entity = timeit.timeit(
+        "mne_bids.get_entity_vals(tmp_bids_root, 'session')",
+        setup=setup,
+        number=1,
     )
-
-    assert (
-        timed_entity / (10 * np.maximum(1, np.floor(len(bids_subdirectories) / 10)))
-        # while this should be of same order, lets give it some space by a factor of 2
-        > 0.5 * timed_entity_match
+    timed_entity_match = timeit.timeit(
+        "mne_bids.get_entity_vals(tmp_bids_root, 'session', include_match='sub-*/')",  # noqa: E501
+        setup=setup,
+        number=1,
     )
 
-    # Clean up
-    shutil.rmtree(tmp_bids_root)
+    # while this should be of same order, lets give it some space by a factor of 2
+    target = 2 * timed_entity / len(bids_subdirectories)
+    assert timed_entity_match < target
+
+    # and these should be equivalent
+    out_1 = get_entity_vals(tmp_bids_root, "session")
+    out_2 = get_entity_vals(tmp_bids_root, "session", include_match="**/")
+    assert out_1 == out_2
+    out_3 = get_entity_vals(tmp_bids_root, "session", include_match="sub-*/")
+    assert out_2 == out_3  # all are sub-* vals
+    out_4 = get_entity_vals(tmp_bids_root, "session", include_match="none/")
+    assert out_4 == []
 
 
 def test_search_folder_for_text(capsys):
@@ -1046,7 +1040,7 @@ def test_filter_fnames(entities, expected_n_matches):
 
 
 @testing.requires_testing_data
-def test_match(return_bids_test_dir):
+def test_match_basic(return_bids_test_dir):
     """Test retrieval of matching basenames."""
     bids_root = Path(return_bids_test_dir)
 
@@ -1140,6 +1134,27 @@ def test_match(return_bids_test_dir):
     bids_path_01.fpath.unlink()  # clean up created file
 
 
+def test_match_advanced(tmp_path):
+    """Test additional match functionality."""
+    bids_root = tmp_path
+    fnames = (
+        "sub-01/nirs/sub-01_task-tapping_events.tsv",
+        "sub-02/nirs/sub-02_task-tapping_events.tsv",
+    )
+    for fname in fnames:
+        this_path = Path(bids_root / fname)
+        this_path.parent.mkdir(parents=True, exist_ok=True)
+        this_path.touch()
+    path = BIDSPath(
+        root=bids_root,
+        datatype="nirs",
+        suffix="events",
+        extension=".tsv",
+    )
+    matches = path.match()
+    assert len(matches) == len(fnames), path
+
+
 @testing.requires_testing_data
 def test_find_matching_paths(return_bids_test_dir):
     """We test by yielding the same results as BIDSPath.match().