From b7a8121d8c3a9813452d31c473beeea09001b7bc Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 18 Apr 2023 13:15:12 -0400 Subject: [PATCH 1/7] MAINT: Report download time and size [circle full] [skip azp] [skip actions] [skip cirrus] --- mne/datasets/_fetch.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/mne/datasets/_fetch.py b/mne/datasets/_fetch.py index 87cd1664534..62a265e2dcb 100644 --- a/mne/datasets/_fetch.py +++ b/mne/datasets/_fetch.py @@ -8,6 +8,7 @@ import os.path as op from pathlib import Path from shutil import rmtree +import time from .. import __version__ as mne_version from ..utils import logger, warn, _safe_input @@ -130,6 +131,7 @@ def fetch_dataset( pass a list of dicts. """ # noqa E501 import pooch + t = time.time() if auth is not None: if len(auth) != 2: @@ -241,8 +243,9 @@ def fetch_dataset( registry[archive_name] = dataset_hash # create the download manager + use_path = final_path if processor is None else Path(path) fetcher = pooch.create( - path=str(final_path) if processor is None else path, + path=str(use_path), base_url="", # Full URLs are given in the `urls` dict. version=None, # Data versioning is decoupled from MNE-Python version. urls=urls, @@ -252,6 +255,7 @@ def fetch_dataset( # use our logger level for pooch's logger too pooch.get_logger().setLevel(logger.getEffectiveLevel()) + sz = 0 for idx in range(len(names)): # fetch and unpack the data @@ -268,9 +272,11 @@ def fetch_dataset( 'the dataset to be downloaded again.') from None else: raise + fname = use_path / archive_name + sz += fname.stat().st_size # after unpacking, remove the archive file if processor is not None: - os.remove(op.join(path, archive_name)) + fname.unlink() # remove version number from "misc" and "testing" datasets folder names if name == "misc": @@ -299,4 +305,13 @@ def fetch_dataset( name=name, current=data_version, newest=mne_version ) ) + t = time.time() - t + fmt = '%Ss' + if t > 60: + fmt = f'%Mm{fmt}' + if t > 3600: + fmt = f'%Hh{fmt}' + sz = sz / 1048576 # 1024 ** 2 + t = time.strftime(fmt, time.gmtime(time.time() - t)) + logger.info(f'Download complete in {t} ({sz:.1f} MB)') return (final_path, data_version) if return_version else final_path From 52492e1d56681132a6e99b6c65613089dd5f614f Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 18 Apr 2023 13:52:50 -0400 Subject: [PATCH 2/7] FIX: Better logging [circle full] [skip azp] [skip actions] [skip cirrus] --- mne/datasets/_fetch.py | 10 +++- mne/datasets/eegbci/eegbci.py | 20 +++++-- mne/datasets/limo/limo.py | 23 ++++++-- mne/datasets/sleep_physionet/_utils.py | 25 ++++---- mne/datasets/sleep_physionet/age.py | 27 ++++++--- mne/datasets/sleep_physionet/temazepam.py | 27 ++++++--- mne/datasets/utils.py | 69 ++++++++++++----------- mne/utils/config.py | 1 + 8 files changed, 131 insertions(+), 71 deletions(-) diff --git a/mne/datasets/_fetch.py b/mne/datasets/_fetch.py index 62a265e2dcb..5d669553942 100644 --- a/mne/datasets/_fetch.py +++ b/mne/datasets/_fetch.py @@ -131,7 +131,7 @@ def fetch_dataset( pass a list of dicts. 
""" # noqa E501 import pooch - t = time.time() + t0 = time.time() if auth is not None: if len(auth) != 2: @@ -305,7 +305,12 @@ def fetch_dataset( name=name, current=data_version, newest=mne_version ) ) - t = time.time() - t + _log_time_size(t0, sz) + return (final_path, data_version) if return_version else final_path + + +def _log_time_size(t0, sz): + t = time.time() - t0 fmt = '%Ss' if t > 60: fmt = f'%Mm{fmt}' @@ -314,4 +319,3 @@ def fetch_dataset( sz = sz / 1048576 # 1024 ** 2 t = time.strftime(fmt, time.gmtime(time.time() - t)) logger.info(f'Download complete in {t} ({sz:.1f} MB)') - return (final_path, data_version) if return_version else final_path diff --git a/mne/datasets/eegbci/eegbci.py b/mne/datasets/eegbci/eegbci.py index d976425dd7a..4ec9a800b07 100644 --- a/mne/datasets/eegbci/eegbci.py +++ b/mne/datasets/eegbci/eegbci.py @@ -7,9 +7,11 @@ import re from os import path as op from pathlib import Path +import time -from ...utils import _url_to_local_path, verbose +from ...utils import _url_to_local_path, verbose, logger from ..utils import _do_path_update, _get_path +from .._fetch import _log_time_size # TODO: remove try/except when our min version is py 3.9 try: @@ -162,6 +164,7 @@ def load_data(subject, runs, path=None, force_update=False, update_path=None, .. footbibliography:: """ # noqa: E501 import pooch + t0 = time.time() if not hasattr(runs, '__iter__'): runs = [runs] @@ -195,14 +198,23 @@ def load_data(subject, runs, path=None, force_update=False, update_path=None, # fetch the file(s) data_paths = [] + sz = 0 for run in runs: file_part = f'S{subject:03d}/S{subject:03d}R{run:02d}.edf' - destination = op.join(base_path, file_part) - if force_update and op.isfile(destination): - os.remove(destination) + destination = Path(op.join(base_path, file_part)) + if destination.exists(): + if force_update: + destination.unlink() + else: + continue + if sz == 0: # log once + logger.info('Downloading EEGBCI data') data_paths.append(fetcher.fetch(file_part)) # update path in config if desired _do_path_update(path, update_path, config_key, name) + sz += destination.stat().st_size + if sz > 0: + _log_time_size(t0, sz) return data_paths diff --git a/mne/datasets/limo/limo.py b/mne/datasets/limo/limo.py index 143a9dd1162..d06e0cecbc8 100644 --- a/mne/datasets/limo/limo.py +++ b/mne/datasets/limo/limo.py @@ -2,8 +2,9 @@ # # License: BSD-3-Clause -import os import os.path as op +from pathlib import Path +import time import numpy as np @@ -12,6 +13,7 @@ from ...io.meas_info import create_info from ...utils import _check_pandas_installed, verbose from ..utils import _get_path, _do_path_update, logger +from .._fetch import _log_time_size # root url for LIMO files @@ -67,6 +69,7 @@ def data_path(subject, path=None, force_update=False, update_path=None, *, .. 
footbibliography:: """ # noqa: E501 import pooch + t0 = time.time() downloader = pooch.HTTPDownloader(progressbar=True) # use tqdm @@ -168,14 +171,23 @@ def data_path(subject, path=None, force_update=False, update_path=None, *, # use our logger level for pooch's logger too pooch.get_logger().setLevel(logger.getEffectiveLevel()) # fetch the data + sz = 0 for fname in ('LIMO.mat', 'Yr.mat'): - destination = op.join(subject_path, fname) - if force_update and op.isfile(destination): - os.remove(destination) + destination = Path(op.join(subject_path, fname)) + if destination.exists(): + if force_update: + destination.unlink() + else: + continue + if sz == 0: # log once + logger.info('Downloading LIMO data') # fetch the remote file (if local file missing or has hash mismatch) fetcher.fetch(fname=fname, downloader=downloader) + sz += destination.stat().st_size # update path in config if desired _do_path_update(path, update_path, config_key, name) + if sz > 0: + _log_time_size(t0, sz) return base_path @@ -282,7 +294,8 @@ def load_data(subject, path=None, force_update=False, update_path=None, metadata = pd.DataFrame(metadata) # -- 6) Create custom epochs array - epochs = EpochsArray(data, info, events, tmin, event_id, metadata=metadata) + epochs = EpochsArray(data, info, events, tmin, event_id, metadata=metadata, + verbose=False) epochs.info['bads'] = missing_chans # missing channels are marked as bad. return epochs diff --git a/mne/datasets/sleep_physionet/_utils.py b/mne/datasets/sleep_physionet/_utils.py index 0c2c0632857..b496e7859dd 100644 --- a/mne/datasets/sleep_physionet/_utils.py +++ b/mne/datasets/sleep_physionet/_utils.py @@ -30,18 +30,19 @@ def _fetch_one(fname, hashsum, path, force_update, base_url): # Fetch the file url = base_url + '/' + fname destination = op.join(path, fname) - if not op.isfile(destination) or force_update: - if op.isfile(destination): - os.remove(destination) - if not op.isdir(op.dirname(destination)): - os.makedirs(op.dirname(destination)) - pooch.retrieve( - url=url, - known_hash=f"sha1:{hashsum}", - path=path, - fname=fname - ) - return destination + if op.isfile(destination) and not force_update: + return destination, False + if op.isfile(destination): + os.remove(destination) + if not op.isdir(op.dirname(destination)): + os.makedirs(op.dirname(destination)) + pooch.retrieve( + url=url, + known_hash=f"sha1:{hashsum}", + path=path, + fname=fname + ) + return destination, True @verbose diff --git a/mne/datasets/sleep_physionet/age.py b/mne/datasets/sleep_physionet/age.py index 4a0d8456639..f7a09a3e872 100644 --- a/mne/datasets/sleep_physionet/age.py +++ b/mne/datasets/sleep_physionet/age.py @@ -3,9 +3,13 @@ # # License: BSD Style. +import os +import time + import numpy as np from ...utils import verbose +from .._fetch import _log_time_size from ._utils import _fetch_one, _data_path, _on_missing, AGE_SLEEP_RECORDS from ._utils import _check_subjects @@ -79,6 +83,7 @@ def fetch_data(subjects, recording=(1, 2), path=None, force_update=False, ---------- .. 
footbibliography:: """ # noqa: E501 + t0 = time.time() records = np.loadtxt(AGE_SLEEP_RECORDS, skiprows=1, delimiter=',', @@ -107,15 +112,23 @@ def fetch_data(subjects, recording=(1, 2), path=None, force_update=False, _on_missing(on_missing, msg) fnames = [] + sz = 0 for subject in subjects: for idx in np.where(psg_records['subject'] == subject)[0]: if psg_records['record'][idx] in recording: - psg_fname = _fetch_one(psg_records['fname'][idx].decode(), - psg_records['sha'][idx].decode(), - *params) - hyp_fname = _fetch_one(hyp_records['fname'][idx].decode(), - hyp_records['sha'][idx].decode(), - *params) + psg_fname, pdl = _fetch_one( + psg_records['fname'][idx].decode(), + psg_records['sha'][idx].decode(), + *params) + hyp_fname, hdl = _fetch_one( + hyp_records['fname'][idx].decode(), + hyp_records['sha'][idx].decode(), + *params) fnames.append([psg_fname, hyp_fname]) - + if pdl: + sz += os.path.getsize(psg_fname) + if hdl: + sz += os.path.getsize(hyp_fname) + if sz > 0: + _log_time_size(t0, sz) return fnames diff --git a/mne/datasets/sleep_physionet/temazepam.py b/mne/datasets/sleep_physionet/temazepam.py index a18f126ab5f..8c290843f7f 100644 --- a/mne/datasets/sleep_physionet/temazepam.py +++ b/mne/datasets/sleep_physionet/temazepam.py @@ -3,9 +3,13 @@ # # License: BSD Style. +import os +import time + import numpy as np from ...utils import verbose +from .._fetch import _log_time_size from ._utils import _fetch_one, _data_path, TEMAZEPAM_SLEEP_RECORDS from ._utils import _check_subjects @@ -67,6 +71,7 @@ def fetch_data(subjects, path=None, force_update=False, base_url=BASE_URL, *, ---------- .. footbibliography:: """ + t0 = time.time() records = np.loadtxt(TEMAZEPAM_SLEEP_RECORDS, skiprows=1, delimiter=',', @@ -83,15 +88,23 @@ def fetch_data(subjects, path=None, force_update=False, base_url=BASE_URL, *, params = [path, force_update, base_url] fnames = [] + sz = 0 for subject in subjects: # all the subjects are present at this point for idx in np.where(records['subject'] == subject)[0]: if records['record'][idx] == b'Placebo': - psg_fname = _fetch_one(records['psg fname'][idx].decode(), - records['psg sha'][idx].decode(), - *params) - hyp_fname = _fetch_one(records['hyp fname'][idx].decode(), - records['hyp sha'][idx].decode(), - *params) + psg_fname, pdl = _fetch_one( + records['psg fname'][idx].decode(), + records['psg sha'][idx].decode(), + *params) + hyp_fname, hdl = _fetch_one( + records['hyp fname'][idx].decode(), + records['hyp sha'][idx].decode(), + *params) fnames.append([psg_fname, hyp_fname]) - + if pdl: + sz += os.path.getsize(psg_fname) + if hdl: + sz += os.path.getsize(hyp_fname) + if sz > 0: + _log_time_size(t0, sz) return fnames diff --git a/mne/datasets/utils.py b/mne/datasets/utils.py index 50a894bfd7b..0d04f090235 100644 --- a/mne/datasets/utils.py +++ b/mne/datasets/utils.py @@ -299,49 +299,52 @@ def _download_all_example_data(verbose=True): # # verbose=True by default so we get nice status messages. # Consider adding datasets from here to CircleCI for PR-auto-build - from . 
import (sample, testing, misc, spm_face, somato, brainstorm, - eegbci, multimodal, opm, hf_sef, mtrf, fieldtrip_cmc, - kiloword, phantom_4dbti, sleep_physionet, limo, - fnirs_motor, refmeg_noise, fetch_infant_template, - fetch_fsaverage, ssvep, erp_core, epilepsy_ecog, - fetch_phantom, eyelink, ucl_opm_auditory) - sample_path = sample.data_path() - testing.data_path() - misc.data_path() - spm_face.data_path() - somato.data_path() - hf_sef.data_path() - multimodal.data_path() - fnirs_motor.data_path() - opm.data_path() - mtrf.data_path() - fieldtrip_cmc.data_path() - kiloword.data_path() - phantom_4dbti.data_path() - refmeg_noise.data_path() - ssvep.data_path() - epilepsy_ecog.data_path() - ucl_opm_auditory.data_path() - brainstorm.bst_raw.data_path(accept=True) - brainstorm.bst_auditory.data_path(accept=True) - brainstorm.bst_resting.data_path(accept=True) - phantom_path = brainstorm.bst_phantom_elekta.data_path(accept=True) - fetch_phantom('otaniemi', subjects_dir=phantom_path) - eyelink.data_path() - brainstorm.bst_phantom_ctf.data_path(accept=True) + paths = dict() + for kind in ('sample testing misc spm_face somato hf_sef multimodal ' + 'fnirs_motor opm mtrf fieldtrip_cmc kiloword phantom_4dbti ' + 'refmeg_noise ssvep epilepsy_ecog ucl_opm_auditory eyelink ' + 'erp_core brainstorm.bst_raw brainstorm.bst_auditory ' + 'brainstorm.bst_resting brainstorm.bst_phantom_ctf ' + 'brainstorm.bst_phantom_elekta' + ).split(): + mod = importlib.import_module(f'mne.datasets.{kind}') + data_path_func = getattr(mod, 'data_path') + kwargs = dict() + if 'accept' in inspect.getfullargspec(data_path_func).args: + kwargs['accept'] = True + paths[kind] = data_path_func(**kwargs) + logger.info(f'[done {kind}]') + + # Now for the exceptions: + from . import ( + eegbci, sleep_physionet, limo, fetch_fsaverage, fetch_infant_template, + fetch_hcp_mmp_parcellation, fetch_phantom) eegbci.load_data(1, [6, 10, 14], update_path=True) for subj in range(4): eegbci.load_data(subj + 1, runs=[3], update_path=True) + logger.info('[done eegbci]') + sleep_physionet.age.fetch_data(subjects=[0, 1], recording=[1]) + logger.info('[done sleep_physionet]') + # If the user has SUBJECTS_DIR, respect it, if not, set it to the EEG one # (probably on CircleCI, or otherwise advanced user) fetch_fsaverage(None) + logger.info('[done fsaverage]') + fetch_infant_template('6mo') + logger.info('[done infant_template]') + fetch_hcp_mmp_parcellation( - subjects_dir=sample_path / 'subjects', accept=True) - limo.load_data(subject=1, update_path=True) + subjects_dir=paths['sample'] / 'subjects', accept=True) + logger.info('[done hcp_mmp_parcellation]') + + fetch_phantom( + 'otaniemi', subjects_dir=paths['brainstorm.bst_phantom_elekta']) + logger.info('[done phantom]') - erp_core.data_path() + limo.load_data(subject=1, update_path=True) + logger.info('[done limo]') @verbose diff --git a/mne/utils/config.py b/mne/utils/config.py index 5056fcfd18a..09a89fe9a0f 100644 --- a/mne/utils/config.py +++ b/mne/utils/config.py @@ -104,6 +104,7 @@ def set_memmap_min_size(memmap_min_size): 'MNE_DATASETS_BRAINSTORM_PATH', 'MNE_DATASETS_EEGBCI_PATH', 'MNE_DATASETS_EPILEPSY_ECOG_PATH', + 'MNE_DATASETS_EYELINK_PATH', 'MNE_DATASETS_HF_SEF_PATH', 'MNE_DATASETS_MEGSIM_PATH', 'MNE_DATASETS_MISC_PATH', From 0d6446c769868226a1c057c9830de770540569a9 Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 18 Apr 2023 15:36:18 -0400 Subject: [PATCH 3/7] FIX: Better logging [circle full] [skip azp] [skip actions] [skip cirrus] --- mne/datasets/_fetch.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/mne/datasets/_fetch.py b/mne/datasets/_fetch.py index 5d669553942..57c11385b0c 100644 --- a/mne/datasets/_fetch.py +++ b/mne/datasets/_fetch.py @@ -317,5 +317,5 @@ def _log_time_size(t0, sz): if t > 3600: fmt = f'%Hh{fmt}' sz = sz / 1048576 # 1024 ** 2 - t = time.strftime(fmt, time.gmtime(time.time() - t)) + t = time.strftime(fmt, time.gmtime(t)) logger.info(f'Download complete in {t} ({sz:.1f} MB)') From 73208888024010af0aac8d106afefbeec91b326c Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 18 Apr 2023 15:49:18 -0400 Subject: [PATCH 4/7] FIX: Better logging [circle full] [skip azp] [skip actions] [skip cirrus] --- mne/datasets/_fetch.py | 22 ++------------ mne/datasets/eegbci/eegbci.py | 8 +++-- mne/datasets/limo/limo.py | 8 ++--- mne/datasets/sleep_physionet/_utils.py | 12 ++++++-- mne/datasets/sleep_physionet/age.py | 2 +- mne/datasets/sleep_physionet/temazepam.py | 2 +- mne/datasets/utils.py | 37 +++++++++++++++++++++-- tools/circleci_download.sh | 1 + 8 files changed, 59 insertions(+), 33 deletions(-) diff --git a/mne/datasets/_fetch.py b/mne/datasets/_fetch.py index 57c11385b0c..578c1cf82ed 100644 --- a/mne/datasets/_fetch.py +++ b/mne/datasets/_fetch.py @@ -2,7 +2,6 @@ # # License: BSD Style. -import logging import sys import os import os.path as op @@ -18,7 +17,8 @@ TESTING_VERSIONED, MISC_VERSIONED, ) -from .utils import _dataset_version, _do_path_update, _get_path +from .utils import (_dataset_version, _do_path_update, _get_path, + _log_time_size, _downloader_params) from ..fixes import _compare_version @@ -222,13 +222,9 @@ def fetch_dataset( "You must agree to the license to use this " "dataset" ) # downloader & processors - download_params = dict(progressbar=logger.level <= logging.INFO) + download_params = _downloader_params(auth=auth, token=token) if name == "fake": download_params["progressbar"] = False - if auth is not None: - download_params["auth"] = auth - if token is not None: - download_params["headers"] = {"Authorization": f"token {token}"} downloader = pooch.HTTPDownloader(**download_params) # make mappings from archive names to urls and to checksums @@ -307,15 +303,3 @@ def fetch_dataset( ) _log_time_size(t0, sz) return (final_path, data_version) if return_version else final_path - - -def _log_time_size(t0, sz): - t = time.time() - t0 - fmt = '%Ss' - if t > 60: - fmt = f'%Mm{fmt}' - if t > 3600: - fmt = f'%Hh{fmt}' - sz = sz / 1048576 # 1024 ** 2 - t = time.strftime(fmt, time.gmtime(t)) - logger.info(f'Download complete in {t} ({sz:.1f} MB)') diff --git a/mne/datasets/eegbci/eegbci.py b/mne/datasets/eegbci/eegbci.py index 4ec9a800b07..925f8b772a1 100644 --- a/mne/datasets/eegbci/eegbci.py +++ b/mne/datasets/eegbci/eegbci.py @@ -10,8 +10,8 @@ import time from ...utils import _url_to_local_path, verbose, logger -from ..utils import _do_path_update, _get_path -from .._fetch import _log_time_size +from ..utils import (_do_path_update, _get_path, _log_time_size, + _downloader_params) # TODO: remove try/except when our min version is py 3.9 try: @@ -81,6 +81,7 @@ def data_path(url, path=None, force_update=False, update_path=None, *, destinations = [destination] # Fetch the file + downloader = pooch.HTTPDownloader(**_downloader_params()) if not op.isfile(destination) or force_update: if op.isfile(destination): os.remove(destination) @@ -90,7 +91,8 @@ def data_path(url, path=None, force_update=False, update_path=None, *, # URL to one of Pooch's test files url=url, path=destination, - fname=fname + downloader=downloader, + 
fname=fname, ) # Offer to update the path diff --git a/mne/datasets/limo/limo.py b/mne/datasets/limo/limo.py index d06e0cecbc8..3701fdb4515 100644 --- a/mne/datasets/limo/limo.py +++ b/mne/datasets/limo/limo.py @@ -11,9 +11,9 @@ from ...channels import make_standard_montage from ...epochs import EpochsArray from ...io.meas_info import create_info -from ...utils import _check_pandas_installed, verbose -from ..utils import _get_path, _do_path_update, logger -from .._fetch import _log_time_size +from ...utils import _check_pandas_installed, verbose, logger +from ..utils import (_get_path, _do_path_update, _log_time_size, + _downloader_params) # root url for LIMO files @@ -71,7 +71,7 @@ def data_path(subject, path=None, force_update=False, update_path=None, *, import pooch t0 = time.time() - downloader = pooch.HTTPDownloader(progressbar=True) # use tqdm + downloader = pooch.HTTPDownloader(**_downloader_params()) # local storage patch config_key = 'MNE_DATASETS_LIMO_PATH' diff --git a/mne/datasets/sleep_physionet/_utils.py b/mne/datasets/sleep_physionet/_utils.py index b496e7859dd..50f992e7803 100644 --- a/mne/datasets/sleep_physionet/_utils.py +++ b/mne/datasets/sleep_physionet/_utils.py @@ -10,7 +10,7 @@ from ...utils import (verbose, _TempDir, _check_pandas_installed, _on_missing) -from ..utils import _get_path +from ..utils import _get_path, _downloader_params AGE_SLEEP_RECORDS = op.join(op.dirname(__file__), 'age_records.csv') TEMAZEPAM_SLEEP_RECORDS = op.join(op.dirname(__file__), @@ -36,10 +36,12 @@ def _fetch_one(fname, hashsum, path, force_update, base_url): os.remove(destination) if not op.isdir(op.dirname(destination)): os.makedirs(op.dirname(destination)) + downloader = pooch.HTTPDownloader(**_downloader_params()) pooch.retrieve( url=url, known_hash=f"sha1:{hashsum}", path=path, + downloader=downloader, fname=fname ) return destination, True @@ -88,11 +90,13 @@ def _update_sleep_temazepam_records(fname=TEMAZEPAM_SLEEP_RECORDS): # Download subjects info. subjects_fname = op.join(tmp, 'ST-subjects.xls') + downloader = pooch.HTTPDownloader(**_downloader_params()) pooch.retrieve( url=TEMAZEPAM_RECORDS_URL, known_hash=f"sha1:{TEMAZEPAM_RECORDS_URL_SHA1}", path=tmp, - fname=op.basename(subjects_fname) + downloader=downloader, + fname=op.basename(subjects_fname), ) # Load and Massage the checksums. @@ -147,11 +151,13 @@ def _update_sleep_age_records(fname=AGE_SLEEP_RECORDS): # Download subjects info. subjects_fname = op.join(tmp, 'SC-subjects.xls') + downloader = pooch.HTTPDownloader(**_downloader_params()) pooch.retrieve( url=AGE_RECORDS_URL, known_hash=f"sha1:{AGE_RECORDS_URL_SHA1}", path=tmp, - fname=op.basename(subjects_fname) + downloader=downloader, + fname=op.basename(subjects_fname), ) # Load and Massage the checksums. 
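
Each pooch call in this patch now receives a downloader built from `_downloader_params`, which is defined in `mne/datasets/utils.py` further down in this same patch, so progress bars and authentication are controlled in one place. Below is a minimal standalone sketch of that injection pattern, using only the public pooch API; the URL and file name are placeholders, and the MNE_TQDM config check from the real helper is omitted for brevity.

import logging

import pooch

logger = logging.getLogger(__name__)


def downloader_params(*, auth=None, token=None):
    """Build keyword arguments for pooch.HTTPDownloader (sketch of _downloader_params)."""
    # Show a progress bar only when the logger would actually emit INFO messages
    params = dict(progressbar=logger.getEffectiveLevel() <= logging.INFO)
    if auth is not None:
        params["auth"] = auth  # forwarded by pooch to requests.get
    if token is not None:
        params["headers"] = {"Authorization": f"token {token}"}
    return params


downloader = pooch.HTTPDownloader(**downloader_params())
path = pooch.retrieve(
    url="https://example.com/data.edf",  # placeholder URL, not a real dataset
    known_hash=None,  # no checksum for this sketch; pooch will warn
    fname="data.edf",
    path=pooch.os_cache("sketch"),
    downloader=downloader,
)

Passing the downloader explicitly, rather than letting pooch construct its default, is what lets a single environment check (here the logger level) silence every progress bar at once.
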
diff --git a/mne/datasets/sleep_physionet/age.py b/mne/datasets/sleep_physionet/age.py index f7a09a3e872..106d39d4e32 100644 --- a/mne/datasets/sleep_physionet/age.py +++ b/mne/datasets/sleep_physionet/age.py @@ -9,7 +9,7 @@ import numpy as np from ...utils import verbose -from .._fetch import _log_time_size +from ..utils import _log_time_size from ._utils import _fetch_one, _data_path, _on_missing, AGE_SLEEP_RECORDS from ._utils import _check_subjects diff --git a/mne/datasets/sleep_physionet/temazepam.py b/mne/datasets/sleep_physionet/temazepam.py index 8c290843f7f..841dbe67a7f 100644 --- a/mne/datasets/sleep_physionet/temazepam.py +++ b/mne/datasets/sleep_physionet/temazepam.py @@ -9,7 +9,7 @@ import numpy as np from ...utils import verbose -from .._fetch import _log_time_size +from ..utils import _log_time_size from ._utils import _fetch_one, _data_path, TEMAZEPAM_SLEEP_RECORDS from ._utils import _check_subjects diff --git a/mne/datasets/utils.py b/mne/datasets/utils.py index 0d04f090235..1fba832abb0 100644 --- a/mne/datasets/utils.py +++ b/mne/datasets/utils.py @@ -11,12 +11,14 @@ from collections import OrderedDict import importlib import inspect +import logging import os import os.path as op from pathlib import Path import sys -import zipfile +import time import tempfile +import zipfile import numpy as np @@ -374,6 +376,7 @@ def fetch_aparc_sub_parcellation(subjects_dir=None, verbose=None): rh='https://osf.io/4kxny/download') hashes = dict(lh='9e4d8d6b90242b7e4b0145353436ef77', rh='dd6464db8e7762d969fc1d8087cd211b') + downloader = pooch.HTTPDownloader(**_downloader_params()) for hemi in ('lh', 'rh'): fname = f'{hemi}.aparc_sub.annot' fpath = destination / fname @@ -382,6 +385,7 @@ def fetch_aparc_sub_parcellation(subjects_dir=None, verbose=None): url=urls[hemi], known_hash=f"md5:{hashes[hemi]}", path=destination, + downloader=downloader, fname=fname, ) @@ -433,6 +437,7 @@ def fetch_hcp_mmp_parcellation(subjects_dir=None, combine=True, *, if answer.lower() != 'y': raise RuntimeError('You must agree to the license to use this ' 'dataset') + downloader = pooch.HTTPDownloader(**_downloader_params()) for hemi, fpath in zip(('lh', 'rh'), fnames): if not op.isfile(fpath): fname = fpath.name @@ -440,6 +445,7 @@ def fetch_hcp_mmp_parcellation(subjects_dir=None, combine=True, *, url=urls[hemi], known_hash=f"md5:{hashes[hemi]}", path=destination, + downloader=downloader, fname=fname, ) @@ -556,6 +562,7 @@ def _manifest_check_download(manifest_path, destination, url, hash_): logger.info('%d file%s missing from %s in %s' % (len(need), _pl(need), manifest_path, destination)) if len(need) > 0: + downloader = pooch.HTTPDownloader(**_downloader_params()) with tempfile.TemporaryDirectory() as path: logger.info('Downloading missing files remotely') @@ -564,7 +571,8 @@ def _manifest_check_download(manifest_path, destination, url, hash_): url=url, known_hash=f"md5:{hash_}", path=path, - fname=op.basename(fname_path) + downloader=downloader, + fname=op.basename(fname_path), ) logger.info('Extracting missing file%s' % (_pl(need),)) @@ -578,3 +586,28 @@ def _manifest_check_download(manifest_path, destination, url, hash_): ff.extract(name, path=destination) logger.info('Successfully extracted %d file%s' % (len(need), _pl(need))) + + +def _log_time_size(t0, sz): + t = time.time() - t0 + fmt = '%Ss' + if t > 60: + fmt = f'%Mm{fmt}' + if t > 3600: + fmt = f'%Hh{fmt}' + sz = sz / 1048576 # 1024 ** 2 + t = time.strftime(fmt, time.gmtime(t)) + logger.info(f'Download complete in {t} ({sz:.1f} MB)') + + +def 
_downloader_params(*, auth=None, token=None): + params = dict() + params['progressbar'] = ( + logger.level <= logging.INFO and + get_config('MNE_TQDM', 'tqdm.auto') != 'off' + ) + if auth is not None: + params["auth"] = auth + if token is not None: + params["headers"] = {"Authorization": f"token {token}"} + return params diff --git a/tools/circleci_download.sh b/tools/circleci_download.sh index 421f6f63ec1..cb622cb1860 100755 --- a/tools/circleci_download.sh +++ b/tools/circleci_download.sh @@ -1,6 +1,7 @@ #!/bin/bash -e set -o pipefail +export MNE_TQDM=off if [ "$CIRCLE_BRANCH" == "main" ] || [[ $(cat gitlog.txt) == *"[circle full]"* ]]; then echo "Doing a full dev build"; From f23fca7511a63f6282e45f89d18ed3721ae9a746 Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 18 Apr 2023 16:19:37 -0400 Subject: [PATCH 5/7] FIX: Zenodo slow-doh [circle full] [skip azp] [skip actions] [skip cirrus] --- .circleci/config.yml | 4 ++++ mne/datasets/config.py | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ab70c684e4d..9cbb54338d4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -230,6 +230,10 @@ jobs: - data-cache-ucl-opm-auditory - run: name: Get data + # This limit could be increased, but this is helpful for finding slow ones + # (even ~2GB datasets should be downloadable in this time from good + # providers) + no_output_timeout: 10m command: | ./tools/circleci_download.sh - run: diff --git a/mne/datasets/config.py b/mne/datasets/config.py index dc851e9bd2f..e84d63b41c4 100644 --- a/mne/datasets/config.py +++ b/mne/datasets/config.py @@ -320,8 +320,10 @@ MNE_DATASETS['hf_sef_evoked'] = dict( archive_name='hf_sef_evoked.tar.gz', hash='md5:13d34cb5db584e00868677d8fb0aab2b', - url=('https://zenodo.org/record/3523071/files/' - 'hf_sef_evoked.tar.gz'), + # Zenodo can be slow, so we use the OSF mirror + # url=('https://zenodo.org/record/3523071/files/' + # 'hf_sef_evoked.tar.gz'), + url='https://osf.io/25f8d/download?version=2', folder_name='hf_sef', config_key='MNE_DATASETS_HF_SEF_PATH', ) From abbd68a6e8e65087613fbebc4cff6f2edc9442ef Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 18 Apr 2023 16:43:31 -0400 Subject: [PATCH 6/7] TST: Run CIs [circle front] From 871e4fbab27cdd149d4b36bbf8a89bf2ce5aa3dd Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 18 Apr 2023 18:38:53 -0400 Subject: [PATCH 7/7] FIX: Tests [circle front] --- mne/datasets/eegbci/eegbci.py | 3 +- mne/datasets/limo/limo.py | 2 +- .../sleep_physionet/tests/test_physionet.py | 48 ++++++++----------- mne/datasets/tests/test_datasets.py | 2 +- 4 files changed, 24 insertions(+), 31 deletions(-) diff --git a/mne/datasets/eegbci/eegbci.py b/mne/datasets/eegbci/eegbci.py index 925f8b772a1..e89ae089fcc 100644 --- a/mne/datasets/eegbci/eegbci.py +++ b/mne/datasets/eegbci/eegbci.py @@ -88,7 +88,6 @@ def data_path(url, path=None, force_update=False, update_path=None, *, if not op.isdir(op.dirname(destination)): os.makedirs(op.dirname(destination)) pooch.retrieve( - # URL to one of Pooch's test files url=url, path=destination, downloader=downloader, @@ -203,7 +202,7 @@ def load_data(subject, runs, path=None, force_update=False, update_path=None, sz = 0 for run in runs: file_part = f'S{subject:03d}/S{subject:03d}R{run:02d}.edf' - destination = Path(op.join(base_path, file_part)) + destination = Path(base_path, file_part) if destination.exists(): if force_update: destination.unlink() diff --git a/mne/datasets/limo/limo.py b/mne/datasets/limo/limo.py 
index 3701fdb4515..e0f1d0f9fa9 100644 --- a/mne/datasets/limo/limo.py +++ b/mne/datasets/limo/limo.py @@ -173,7 +173,7 @@ def data_path(subject, path=None, force_update=False, update_path=None, *, # fetch the data sz = 0 for fname in ('LIMO.mat', 'Yr.mat'): - destination = Path(op.join(subject_path, fname)) + destination = Path(subject_path, fname) if destination.exists(): if force_update: destination.unlink() diff --git a/mne/datasets/sleep_physionet/tests/test_physionet.py b/mne/datasets/sleep_physionet/tests/test_physionet.py index 3f754b863ac..549963cb73f 100644 --- a/mne/datasets/sleep_physionet/tests/test_physionet.py +++ b/mne/datasets/sleep_physionet/tests/test_physionet.py @@ -3,11 +3,9 @@ # # License: BSD Style. -import os.path as op -import numpy as np +from pathlib import Path import pytest -from numpy.testing import assert_array_equal import pooch from mne.utils import requires_good_network @@ -32,14 +30,15 @@ def __init__(self): def __call__(self, *args, **kwargs): self.call_args_list.append((args, kwargs)) + Path(kwargs['path'], kwargs['fname']).write_text('test') @property def call_count(self): return len(self.call_args_list) -def _keep_basename_only(path_structure): - return np.vectorize(op.basename)(np.array(path_structure)) +def _keep_basename_only(paths): + return [Path(p).name for p in paths] def _get_expected_url(name): @@ -49,7 +48,7 @@ def _get_expected_url(name): def _get_expected_path(base, name): - return op.join(base, name) + return Path(base, name) def _check_mocked_function_calls(mocked_func, call_fname_hash_pairs, @@ -62,8 +61,8 @@ def _check_mocked_function_calls(mocked_func, call_fname_hash_pairs, for idx, current in enumerate(call_fname_hash_pairs): _, call_kwargs = mocked_func.call_args_list[idx] hash_type, hash = call_kwargs['known_hash'].split(':') - assert call_kwargs['url'] == _get_expected_url(current['name']) - assert op.join(call_kwargs['path'], call_kwargs['fname']) == \ + assert call_kwargs['url'] == _get_expected_url(current['name']), idx + assert Path(call_kwargs['path'], call_kwargs['fname']) == \ _get_expected_path(base_path, current['name']) assert hash == current['hash'] assert hash_type == 'sha1' @@ -130,26 +129,24 @@ def test_sleep_physionet_age(physionet_tmpdir, monkeypatch, download_is_error): monkeypatch.setattr(pooch, 'retrieve', my_func) paths = age.fetch_data(subjects=[0], recording=[1], path=physionet_tmpdir) - assert_array_equal(_keep_basename_only(paths), - [['SC4001E0-PSG.edf', 'SC4001EC-Hypnogram.edf']]) + assert _keep_basename_only(paths[0]) == \ + ['SC4001E0-PSG.edf', 'SC4001EC-Hypnogram.edf'] paths = age.fetch_data(subjects=[0, 1], recording=[1], path=physionet_tmpdir) - assert_array_equal(_keep_basename_only(paths), - [['SC4001E0-PSG.edf', 'SC4001EC-Hypnogram.edf'], - ['SC4011E0-PSG.edf', 'SC4011EH-Hypnogram.edf']]) + assert _keep_basename_only(paths[0]) == \ + ['SC4001E0-PSG.edf', 'SC4001EC-Hypnogram.edf'] + assert _keep_basename_only(paths[1]) == \ + ['SC4011E0-PSG.edf', 'SC4011EH-Hypnogram.edf'] paths = age.fetch_data(subjects=[0], recording=[1, 2], path=physionet_tmpdir) - assert_array_equal(_keep_basename_only(paths), - [['SC4001E0-PSG.edf', 'SC4001EC-Hypnogram.edf'], - ['SC4002E0-PSG.edf', 'SC4002EC-Hypnogram.edf']]) + assert _keep_basename_only(paths[0]) == \ + ['SC4001E0-PSG.edf', 'SC4001EC-Hypnogram.edf'] + assert _keep_basename_only(paths[1]) == \ + ['SC4002E0-PSG.edf', 'SC4002EC-Hypnogram.edf'] EXPECTED_CALLS = ( - {'name': 'SC4001E0-PSG.edf', - 'hash': 'adabd3b01fc7bb75c523a974f38ee3ae4e57b40f'}, - {'name': 
'SC4001EC-Hypnogram.edf', - 'hash': '21c998eadc8b1e3ea6727d3585186b8f76e7e70b'}, {'name': 'SC4001E0-PSG.edf', 'hash': 'adabd3b01fc7bb75c523a974f38ee3ae4e57b40f'}, {'name': 'SC4001EC-Hypnogram.edf', @@ -158,14 +155,11 @@ def test_sleep_physionet_age(physionet_tmpdir, monkeypatch, download_is_error): 'hash': '4d17451f7847355bcab17584de05e7e1df58c660'}, {'name': 'SC4011EH-Hypnogram.edf', 'hash': 'd582a3cbe2db481a362af890bc5a2f5ca7c878dc'}, - {'name': 'SC4001E0-PSG.edf', - 'hash': 'adabd3b01fc7bb75c523a974f38ee3ae4e57b40f'}, - {'name': 'SC4001EC-Hypnogram.edf', - 'hash': '21c998eadc8b1e3ea6727d3585186b8f76e7e70b'}, {'name': 'SC4002E0-PSG.edf', 'hash': 'c6b6d7a8605cc7e7602b6028ee77f6fbf5f7581d'}, {'name': 'SC4002EC-Hypnogram.edf', - 'hash': '386230188a3552b1fc90bba0fb7476ceaca174b6'}) + 'hash': '386230188a3552b1fc90bba0fb7476ceaca174b6'}, + ) base_path = age.data_path(path=physionet_tmpdir) _check_mocked_function_calls(my_func, EXPECTED_CALLS, base_path) @@ -192,8 +186,8 @@ def test_sleep_physionet_temazepam(physionet_tmpdir, monkeypatch): monkeypatch.setattr(pooch, 'retrieve', my_func) paths = temazepam.fetch_data(subjects=[0], path=physionet_tmpdir) - assert_array_equal(_keep_basename_only(paths), - [['ST7011J0-PSG.edf', 'ST7011JP-Hypnogram.edf']]) + assert _keep_basename_only(paths[0]) == \ + ['ST7011J0-PSG.edf', 'ST7011JP-Hypnogram.edf'] EXPECTED_CALLS = ( {'name': 'ST7011J0-PSG.edf', diff --git a/mne/datasets/tests/test_datasets.py b/mne/datasets/tests/test_datasets.py index 8709b934326..46c1ecd229f 100644 --- a/mne/datasets/tests/test_datasets.py +++ b/mne/datasets/tests/test_datasets.py @@ -189,7 +189,7 @@ def test_fetch_parcellations(tmp_path): _zip_fnames = ['foo/foo.txt', 'foo/bar.txt', 'foo/baz.txt'] -def _fake_zip_fetch(url, path, fname, known_hash): +def _fake_zip_fetch(url, path, fname, *args, **kwargs): fname = op.join(path, fname) with zipfile.ZipFile(fname, 'w') as zipf: with zipf.open('foo/', 'w'):
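
Taken together, the series settles on the small timing helper below: patch 1 introduced it inline in `fetch_dataset`, patch 3 fixed the double subtraction of the start time, and patch 4 moved it into `mne/datasets/utils.py` for reuse by every fetcher. A self-contained sketch of its final behavior follows; the module-level logger setup and the `payload` stand-in for downloaded bytes are illustrative only.

import logging
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _log_time_size(t0, sz):
    """Log elapsed wall-clock time and downloaded size (final patched form)."""
    t = time.time() - t0
    fmt = '%Ss'
    if t > 60:  # include minutes once past one minute
        fmt = f'%Mm{fmt}'
    if t > 3600:  # include hours once past one hour
        fmt = f'%Hh{fmt}'
    sz = sz / 1048576  # bytes -> MB (1024 ** 2)
    t = time.strftime(fmt, time.gmtime(t))
    logger.info(f'Download complete in {t} ({sz:.1f} MB)')


t0 = time.time()
payload = b'x' * (3 * 1048576)  # stand-in for 3 MB of downloaded data
_log_time_size(t0, len(payload))  # e.g. "Download complete in 00s (3.0 MB)"

Formatting the duration through time.strftime keeps the output compact (e.g. 02m05s, 01h03m20s) with no extra dependency, which is why each caller accumulates `sz` itself and only logs when at least one file was actually downloaded.
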