MAINT: Report download time and size #11635

Merged: 7 commits, Apr 18, 2023
4 changes: 4 additions & 0 deletions .circleci/config.yml
@@ -230,6 +230,10 @@ jobs:
             - data-cache-ucl-opm-auditory
       - run:
           name: Get data
+          # This limit could be increased, but this is helpful for finding slow ones
+          # (even ~2GB datasets should be downloadable in this time from good
+          # providers)
+          no_output_timeout: 10m
           command: |
             ./tools/circleci_download.sh
       - run:
21 changes: 12 additions & 9 deletions mne/datasets/_fetch.py
@@ -2,12 +2,12 @@
 #
 # License: BSD Style.
 
-import logging
 import sys
 import os
 import os.path as op
 from pathlib import Path
 from shutil import rmtree
+import time
 
 from .. import __version__ as mne_version
 from ..utils import logger, warn, _safe_input
@@ -17,7 +17,8 @@
     TESTING_VERSIONED,
     MISC_VERSIONED,
 )
-from .utils import _dataset_version, _do_path_update, _get_path
+from .utils import (_dataset_version, _do_path_update, _get_path,
+                    _log_time_size, _downloader_params)
 from ..fixes import _compare_version
 
 
@@ -130,6 +131,7 @@ def fetch_dataset(
         pass a list of dicts.
     """  # noqa E501
     import pooch
+    t0 = time.time()
 
     if auth is not None:
         if len(auth) != 2:
@@ -220,13 +222,9 @@
             "You must agree to the license to use this " "dataset"
         )
     # downloader & processors
-    download_params = dict(progressbar=logger.level <= logging.INFO)
+    download_params = _downloader_params(auth=auth, token=token)
     if name == "fake":
         download_params["progressbar"] = False
-    if auth is not None:
-        download_params["auth"] = auth
-    if token is not None:
-        download_params["headers"] = {"Authorization": f"token {token}"}
     downloader = pooch.HTTPDownloader(**download_params)
 
     # make mappings from archive names to urls and to checksums
@@ -241,8 +239,9 @@
         registry[archive_name] = dataset_hash
 
     # create the download manager
+    use_path = final_path if processor is None else Path(path)
     fetcher = pooch.create(
-        path=str(final_path) if processor is None else path,
+        path=str(use_path),
         base_url="",  # Full URLs are given in the `urls` dict.
         version=None,  # Data versioning is decoupled from MNE-Python version.
         urls=urls,
@@ -252,6 +251,7 @@
 
     # use our logger level for pooch's logger too
     pooch.get_logger().setLevel(logger.getEffectiveLevel())
+    sz = 0
 
     for idx in range(len(names)):
         # fetch and unpack the data
@@ -268,9 +268,11 @@
                 'the dataset to be downloaded again.') from None
         else:
             raise
+        fname = use_path / archive_name
+        sz += fname.stat().st_size
         # after unpacking, remove the archive file
         if processor is not None:
-            os.remove(op.join(path, archive_name))
+            fname.unlink()
 
     # remove version number from "misc" and "testing" datasets folder names
     if name == "misc":
@@ -299,4 +301,5 @@
             name=name, current=data_version, newest=mne_version
         )
     )
+    _log_time_size(t0, sz)
     return (final_path, data_version) if return_version else final_path
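
The `_downloader_params` and `_log_time_size` helpers imported above live in `mne/datasets/utils.py`, which is not part of this diff. A minimal sketch of what they plausibly contain, inferred from the call sites and from the lines removed in the hunks above (the import path for `logger` and the exact log-message format are assumptions):

import logging
import time

from ..utils import logger  # assumed import path for MNE's logger


def _downloader_params(auth=None, token=None):
    """Return keyword arguments for pooch.HTTPDownloader."""
    # Centralizes the progressbar/auth/token logic that each call site
    # previously duplicated (see the removed lines above).
    params = dict(progressbar=logger.level <= logging.INFO)
    if auth is not None:
        params['auth'] = auth
    if token is not None:
        params['headers'] = {'Authorization': f'token {token}'}
    return params


def _log_time_size(t0, sz):
    """Log elapsed download time and total downloaded size."""
    t = time.time() - t0
    logger.info('Download complete in %02dm%02ds (%.1f MB)'
                % (t // 60, t % 60, sz / 1048576))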
6 changes: 4 additions & 2 deletions mne/datasets/config.py
@@ -320,8 +320,10 @@
 MNE_DATASETS['hf_sef_evoked'] = dict(
     archive_name='hf_sef_evoked.tar.gz',
     hash='md5:13d34cb5db584e00868677d8fb0aab2b',
-    url=('https://zenodo.org/record/3523071/files/'
-         'hf_sef_evoked.tar.gz'),
+    # Zenodo can be slow, so we use the OSF mirror
+    # url=('https://zenodo.org/record/3523071/files/'
+    #      'hf_sef_evoked.tar.gz'),
+    url='https://osf.io/25f8d/download?version=2',
     folder_name='hf_sef',
     config_key='MNE_DATASETS_HF_SEF_PATH',
 )
27 changes: 20 additions & 7 deletions mne/datasets/eegbci/eegbci.py
@@ -7,9 +7,11 @@
 import re
 from os import path as op
 from pathlib import Path
+import time
 
-from ...utils import _url_to_local_path, verbose
-from ..utils import _do_path_update, _get_path
+from ...utils import _url_to_local_path, verbose, logger
+from ..utils import (_do_path_update, _get_path, _log_time_size,
+                     _downloader_params)
 
 # TODO: remove try/except when our min version is py 3.9
 try:
@@ -79,16 +81,17 @@ def data_path(url, path=None, force_update=False, update_path=None, *,
     destinations = [destination]
 
     # Fetch the file
+    downloader = pooch.HTTPDownloader(**_downloader_params())
     if not op.isfile(destination) or force_update:
         if op.isfile(destination):
             os.remove(destination)
         if not op.isdir(op.dirname(destination)):
             os.makedirs(op.dirname(destination))
         pooch.retrieve(
-            # URL to one of Pooch's test files
             url=url,
             path=destination,
-            fname=fname
+            downloader=downloader,
+            fname=fname,
         )
 
     # Offer to update the path
@@ -162,6 +165,7 @@ def load_data(subject, runs, path=None, force_update=False, update_path=None,
     .. footbibliography::
     """  # noqa: E501
     import pooch
+    t0 = time.time()
 
     if not hasattr(runs, '__iter__'):
         runs = [runs]
@@ -195,14 +199,23 @@
 
     # fetch the file(s)
     data_paths = []
+    sz = 0
     for run in runs:
         file_part = f'S{subject:03d}/S{subject:03d}R{run:02d}.edf'
-        destination = op.join(base_path, file_part)
-        if force_update and op.isfile(destination):
-            os.remove(destination)
+        destination = Path(base_path, file_part)
+        if destination.exists():
+            if force_update:
+                destination.unlink()
+            else:
+                continue
+        if sz == 0:  # log once
+            logger.info('Downloading EEGBCI data')
         data_paths.append(fetcher.fetch(file_part))
         # update path in config if desired
         _do_path_update(path, update_path, config_key, name)
+        sz += destination.stat().st_size
+    if sz > 0:
+        _log_time_size(t0, sz)
     return data_paths
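
After this change, the first call for a given subject downloads the requested runs, logs 'Downloading EEGBCI data' once, and reports the total time and size via `_log_time_size`; on a later call the files already exist, so nothing is downloaded or reported. A usage sketch (subject and run numbers are illustrative):

from mne.datasets import eegbci

# First call: fetches three EDF runs for subject 1, then logs the
# one-time download message plus total elapsed time and size.
paths = eegbci.load_data(1, [4, 8, 12])
# Repeat call: the files exist and force_update is False, so each run
# hits the `continue` branch and no download or report happens.
paths = eegbci.load_data(1, [4, 8, 12])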


29 changes: 21 additions & 8 deletions mne/datasets/limo/limo.py
@@ -2,16 +2,18 @@
 #
 # License: BSD-3-Clause
 
-import os
-import os.path as op
+from pathlib import Path
+import time
 
 import numpy as np
 
 from ...channels import make_standard_montage
 from ...epochs import EpochsArray
 from ...io.meas_info import create_info
-from ...utils import _check_pandas_installed, verbose
-from ..utils import _get_path, _do_path_update, logger
+from ...utils import _check_pandas_installed, verbose, logger
+from ..utils import (_get_path, _do_path_update, _log_time_size,
+                     _downloader_params)


# root url for LIMO files
@@ -67,8 +69,9 @@ def data_path(subject, path=None, force_update=False, update_path=None, *,
     .. footbibliography::
     """  # noqa: E501
     import pooch
+    t0 = time.time()
 
-    downloader = pooch.HTTPDownloader(progressbar=True)  # use tqdm
+    downloader = pooch.HTTPDownloader(**_downloader_params())
 
     # local storage patch
     config_key = 'MNE_DATASETS_LIMO_PATH'
@@ -168,14 +171,23 @@ def data_path(subject, path=None, force_update=False, update_path=None, *,
     # use our logger level for pooch's logger too
     pooch.get_logger().setLevel(logger.getEffectiveLevel())
     # fetch the data
+    sz = 0
     for fname in ('LIMO.mat', 'Yr.mat'):
-        destination = op.join(subject_path, fname)
-        if force_update and op.isfile(destination):
-            os.remove(destination)
+        destination = Path(subject_path, fname)
+        if destination.exists():
+            if force_update:
+                destination.unlink()
+            else:
+                continue
+        if sz == 0:  # log once
+            logger.info('Downloading LIMO data')
         # fetch the remote file (if local file missing or has hash mismatch)
         fetcher.fetch(fname=fname, downloader=downloader)
+        sz += destination.stat().st_size
     # update path in config if desired
     _do_path_update(path, update_path, config_key, name)
+    if sz > 0:
+        _log_time_size(t0, sz)
     return base_path


@@ -282,7 +294,8 @@ def load_data(subject, path=None, force_update=False, update_path=None,
     metadata = pd.DataFrame(metadata)
 
     # -- 6) Create custom epochs array
-    epochs = EpochsArray(data, info, events, tmin, event_id, metadata=metadata)
+    epochs = EpochsArray(data, info, events, tmin, event_id, metadata=metadata,
+                         verbose=False)
     epochs.info['bads'] = missing_chans  # missing channels are marked as bad.
 
     return epochs
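
Passing `verbose=False` to `EpochsArray` keeps per-subject epoch-construction messages from drowning out the new download report. A usage sketch (the subject number is illustrative):

from mne.datasets import limo

# First use: fetches LIMO.mat and Yr.mat for subject 2, logging
# 'Downloading LIMO data' once plus the total time and size.
epochs = limo.load_data(subject=2)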
37 changes: 22 additions & 15 deletions mne/datasets/sleep_physionet/_utils.py
@@ -10,7 +10,7 @@
 
 from ...utils import (verbose, _TempDir, _check_pandas_installed,
                       _on_missing)
-from ..utils import _get_path
+from ..utils import _get_path, _downloader_params
 
 AGE_SLEEP_RECORDS = op.join(op.dirname(__file__), 'age_records.csv')
 TEMAZEPAM_SLEEP_RECORDS = op.join(op.dirname(__file__),
@@ -30,18 +30,21 @@ def _fetch_one(fname, hashsum, path, force_update, base_url):
     # Fetch the file
     url = base_url + '/' + fname
     destination = op.join(path, fname)
-    if not op.isfile(destination) or force_update:
-        if op.isfile(destination):
-            os.remove(destination)
-        if not op.isdir(op.dirname(destination)):
-            os.makedirs(op.dirname(destination))
-        pooch.retrieve(
-            url=url,
-            known_hash=f"sha1:{hashsum}",
-            path=path,
-            fname=fname
-        )
-    return destination
+    if op.isfile(destination) and not force_update:
+        return destination, False
+    if op.isfile(destination):
+        os.remove(destination)
+    if not op.isdir(op.dirname(destination)):
+        os.makedirs(op.dirname(destination))
+    downloader = pooch.HTTPDownloader(**_downloader_params())
+    pooch.retrieve(
+        url=url,
+        known_hash=f"sha1:{hashsum}",
+        path=path,
+        downloader=downloader,
+        fname=fname
+    )
+    return destination, True


@verbose
@@ -87,11 +90,13 @@ def _update_sleep_temazepam_records(fname=TEMAZEPAM_SLEEP_RECORDS):
 
     # Download subjects info.
     subjects_fname = op.join(tmp, 'ST-subjects.xls')
+    downloader = pooch.HTTPDownloader(**_downloader_params())
     pooch.retrieve(
         url=TEMAZEPAM_RECORDS_URL,
         known_hash=f"sha1:{TEMAZEPAM_RECORDS_URL_SHA1}",
         path=tmp,
-        fname=op.basename(subjects_fname)
+        downloader=downloader,
+        fname=op.basename(subjects_fname),
     )

# Load and Massage the checksums.
@@ -146,11 +151,13 @@ def _update_sleep_age_records(fname=AGE_SLEEP_RECORDS):
 
     # Download subjects info.
     subjects_fname = op.join(tmp, 'SC-subjects.xls')
+    downloader = pooch.HTTPDownloader(**_downloader_params())
     pooch.retrieve(
         url=AGE_RECORDS_URL,
         known_hash=f"sha1:{AGE_RECORDS_URL_SHA1}",
         path=tmp,
-        fname=op.basename(subjects_fname)
+        downloader=downloader,
+        fname=op.basename(subjects_fname),
     )

# Load and Massage the checksums.
27 changes: 20 additions & 7 deletions mne/datasets/sleep_physionet/age.py
@@ -3,9 +3,13 @@
 #
 # License: BSD Style.
 
+import os
+import time
+
 import numpy as np
 
 from ...utils import verbose
+from ..utils import _log_time_size
 from ._utils import _fetch_one, _data_path, _on_missing, AGE_SLEEP_RECORDS
 from ._utils import _check_subjects
 
@@ -79,6 +83,7 @@ def fetch_data(subjects, recording=(1, 2), path=None, force_update=False,
     ----------
     .. footbibliography::
     """  # noqa: E501
+    t0 = time.time()
     records = np.loadtxt(AGE_SLEEP_RECORDS,
                          skiprows=1,
                          delimiter=',',
@@ -107,15 +112,23 @@
         _on_missing(on_missing, msg)
 
     fnames = []
+    sz = 0
     for subject in subjects:
         for idx in np.where(psg_records['subject'] == subject)[0]:
             if psg_records['record'][idx] in recording:
-                psg_fname = _fetch_one(psg_records['fname'][idx].decode(),
-                                       psg_records['sha'][idx].decode(),
-                                       *params)
-                hyp_fname = _fetch_one(hyp_records['fname'][idx].decode(),
-                                       hyp_records['sha'][idx].decode(),
-                                       *params)
+                psg_fname, pdl = _fetch_one(
+                    psg_records['fname'][idx].decode(),
+                    psg_records['sha'][idx].decode(),
+                    *params)
+                hyp_fname, hdl = _fetch_one(
+                    hyp_records['fname'][idx].decode(),
+                    hyp_records['sha'][idx].decode(),
+                    *params)
                 fnames.append([psg_fname, hyp_fname])
-
+                if pdl:
+                    sz += os.path.getsize(psg_fname)
+                if hdl:
+                    sz += os.path.getsize(hyp_fname)
+    if sz > 0:
+        _log_time_size(t0, sz)
     return fnames
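
Because `_fetch_one` now returns a `(path, downloaded)` tuple, only files that were actually transferred count toward `sz`, and a fully cached call logs no report at all. A usage sketch (subject and recording values are illustrative):

from mne.datasets.sleep_physionet import age

# Fetches the PSG/hypnogram pairs for one subject; if both files of a
# pair are already cached, pdl/hdl are False and sz stays 0.
fnames = age.fetch_data(subjects=[0], recording=[1])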