MAINT: Report download time and size #11635

Merged: 7 commits, Apr 18, 2023
4 changes: 4 additions & 0 deletions .circleci/config.yml
@@ -230,6 +230,10 @@ jobs:
             - data-cache-ucl-opm-auditory
       - run:
           name: Get data
+          # This limit could be increased, but this is helpful for finding slow ones
+          # (even ~2GB datasets should be downloadable in this time from good
+          # providers)
+          no_output_timeout: 10m
           command: |
             ./tools/circleci_download.sh
       - run:
21 changes: 12 additions & 9 deletions mne/datasets/_fetch.py
@@ -2,12 +2,12 @@
 #
 # License: BSD Style.
 
-import logging
 import sys
 import os
 import os.path as op
 from pathlib import Path
 from shutil import rmtree
+import time
 
 from .. import __version__ as mne_version
 from ..utils import logger, warn, _safe_input
@@ -17,7 +17,8 @@
     TESTING_VERSIONED,
     MISC_VERSIONED,
 )
-from .utils import _dataset_version, _do_path_update, _get_path
+from .utils import (_dataset_version, _do_path_update, _get_path,
+                    _log_time_size, _downloader_params)
 from ..fixes import _compare_version
 
 
@@ -130,6 +131,7 @@ def fetch_dataset(
         pass a list of dicts.
     """  # noqa E501
     import pooch
+    t0 = time.time()
 
     if auth is not None:
         if len(auth) != 2:
@@ -220,13 +222,9 @@
             "You must agree to the license to use this " "dataset"
         )
     # downloader & processors
-    download_params = dict(progressbar=logger.level <= logging.INFO)
+    download_params = _downloader_params(auth=auth, token=token)
     if name == "fake":
         download_params["progressbar"] = False
-    if auth is not None:
-        download_params["auth"] = auth
-    if token is not None:
-        download_params["headers"] = {"Authorization": f"token {token}"}
     downloader = pooch.HTTPDownloader(**download_params)
 
     # make mappings from archive names to urls and to checksums
@@ -241,8 +239,9 @@
         registry[archive_name] = dataset_hash
 
     # create the download manager
+    use_path = final_path if processor is None else Path(path)
     fetcher = pooch.create(
-        path=str(final_path) if processor is None else path,
+        path=str(use_path),
         base_url="",  # Full URLs are given in the `urls` dict.
         version=None,  # Data versioning is decoupled from MNE-Python version.
         urls=urls,
@@ -252,6 +251,7 @@
 
     # use our logger level for pooch's logger too
     pooch.get_logger().setLevel(logger.getEffectiveLevel())
+    sz = 0
 
     for idx in range(len(names)):
         # fetch and unpack the data
@@ -268,9 +268,11 @@
                 'the dataset to be downloaded again.') from None
         else:
             raise
+        fname = use_path / archive_name
+        sz += fname.stat().st_size
         # after unpacking, remove the archive file
         if processor is not None:
-            os.remove(op.join(path, archive_name))
+            fname.unlink()
 
     # remove version number from "misc" and "testing" datasets folder names
     if name == "misc":
@@ -299,4 +301,5 @@
             name=name, current=data_version, newest=mne_version
         )
     )
+    _log_time_size(t0, sz)
     return (final_path, data_version) if return_version else final_path
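
The `_downloader_params` and `_log_time_size` helpers imported above live in `mne/datasets/utils.py`, which is not part of this diff. A minimal sketch of what they plausibly contain, inferred from the call sites and from the lines removed in the hunks above (the import path for `logger` and the exact log-message format are assumptions):

import logging
import time

from ..utils import logger  # assumed import path for MNE's logger


def _downloader_params(auth=None, token=None):
    """Return keyword arguments for pooch.HTTPDownloader."""
    # Centralizes the progressbar/auth/token logic that each call site
    # previously duplicated (see the removed lines above).
    params = dict(progressbar=logger.level <= logging.INFO)
    if auth is not None:
        params['auth'] = auth
    if token is not None:
        params['headers'] = {'Authorization': f'token {token}'}
    return params


def _log_time_size(t0, sz):
    """Log elapsed download time and total downloaded size."""
    t = time.time() - t0
    logger.info('Download complete in %02dm%02ds (%.1f MB)'
                % (t // 60, t % 60, sz / 1048576))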
6 changes: 4 additions & 2 deletions mne/datasets/config.py
@@ -320,8 +320,10 @@
 MNE_DATASETS['hf_sef_evoked'] = dict(
     archive_name='hf_sef_evoked.tar.gz',
     hash='md5:13d34cb5db584e00868677d8fb0aab2b',
-    url=('https://zenodo.org/record/3523071/files/'
-         'hf_sef_evoked.tar.gz'),
+    # Zenodo can be slow, so we use the OSF mirror
+    # url=('https://zenodo.org/record/3523071/files/'
+    #      'hf_sef_evoked.tar.gz'),
+    url='https://osf.io/25f8d/download?version=2',
     folder_name='hf_sef',
     config_key='MNE_DATASETS_HF_SEF_PATH',
 )
27 changes: 20 additions & 7 deletions mne/datasets/eegbci/eegbci.py
@@ -7,9 +7,11 @@
 import re
 from os import path as op
 from pathlib import Path
+import time
 
-from ...utils import _url_to_local_path, verbose
-from ..utils import _do_path_update, _get_path
+from ...utils import _url_to_local_path, verbose, logger
+from ..utils import (_do_path_update, _get_path, _log_time_size,
+                     _downloader_params)
 
 # TODO: remove try/except when our min version is py 3.9
 try:
@@ -79,16 +81,17 @@ def data_path(url, path=None, force_update=False, update_path=None, *,
     destinations = [destination]
 
     # Fetch the file
+    downloader = pooch.HTTPDownloader(**_downloader_params())
     if not op.isfile(destination) or force_update:
         if op.isfile(destination):
             os.remove(destination)
         if not op.isdir(op.dirname(destination)):
             os.makedirs(op.dirname(destination))
         pooch.retrieve(
-            # URL to one of Pooch's test files
             url=url,
             path=destination,
-            fname=fname
+            downloader=downloader,
+            fname=fname,
         )
 
     # Offer to update the path
@@ -162,6 +165,7 @@ def load_data(subject, runs, path=None, force_update=False, update_path=None,
     .. footbibliography::
     """  # noqa: E501
     import pooch
+    t0 = time.time()
 
     if not hasattr(runs, '__iter__'):
         runs = [runs]
@@ -195,14 +199,23 @@
 
     # fetch the file(s)
     data_paths = []
+    sz = 0
     for run in runs:
         file_part = f'S{subject:03d}/S{subject:03d}R{run:02d}.edf'
-        destination = op.join(base_path, file_part)
-        if force_update and op.isfile(destination):
-            os.remove(destination)
+        destination = Path(base_path, file_part)
+        if destination.exists():
+            if force_update:
+                destination.unlink()
+            else:
+                continue
+        if sz == 0:  # log once
+            logger.info('Downloading EEGBCI data')
         data_paths.append(fetcher.fetch(file_part))
         # update path in config if desired
         _do_path_update(path, update_path, config_key, name)
+        sz += destination.stat().st_size
+    if sz > 0:
+        _log_time_size(t0, sz)
     return data_paths
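
After this change, the first call for a given subject downloads the requested runs, logs 'Downloading EEGBCI data' once, and reports the total time and size via `_log_time_size`; on a later call the files already exist, so nothing is downloaded or reported. A usage sketch (subject and run numbers are illustrative):

from mne.datasets import eegbci

# First call: fetches three EDF runs for subject 1, then logs the
# one-time download message plus total elapsed time and size.
paths = eegbci.load_data(1, [4, 8, 12])
# Repeat call: the files exist and force_update is False, so each run
# hits the `continue` branch and no download or report happens.
paths = eegbci.load_data(1, [4, 8, 12])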


29 changes: 21 additions & 8 deletions mne/datasets/limo/limo.py
@@ -2,16 +2,18 @@
 #
 # License: BSD-3-Clause
 
-import os
-import os.path as op
+from pathlib import Path
+import time
 
 import numpy as np
 
 from ...channels import make_standard_montage
 from ...epochs import EpochsArray
 from ...io.meas_info import create_info
-from ...utils import _check_pandas_installed, verbose
-from ..utils import _get_path, _do_path_update, logger
+from ...utils import _check_pandas_installed, verbose, logger
+from ..utils import (_get_path, _do_path_update, _log_time_size,
+                     _downloader_params)


# root url for LIMO files
@@ -67,8 +69,9 @@ def data_path(subject, path=None, force_update=False, update_path=None, *,
     .. footbibliography::
     """  # noqa: E501
     import pooch
+    t0 = time.time()
 
-    downloader = pooch.HTTPDownloader(progressbar=True)  # use tqdm
+    downloader = pooch.HTTPDownloader(**_downloader_params())
 
     # local storage patch
     config_key = 'MNE_DATASETS_LIMO_PATH'
@@ -168,14 +171,23 @@ def data_path(subject, path=None, force_update=False, update_path=None, *,
     # use our logger level for pooch's logger too
     pooch.get_logger().setLevel(logger.getEffectiveLevel())
     # fetch the data
+    sz = 0
     for fname in ('LIMO.mat', 'Yr.mat'):
-        destination = op.join(subject_path, fname)
-        if force_update and op.isfile(destination):
-            os.remove(destination)
+        destination = Path(subject_path, fname)
+        if destination.exists():
+            if force_update:
+                destination.unlink()
+            else:
+                continue
+        if sz == 0:  # log once
+            logger.info('Downloading LIMO data')
         # fetch the remote file (if local file missing or has hash mismatch)
         fetcher.fetch(fname=fname, downloader=downloader)
+        sz += destination.stat().st_size
     # update path in config if desired
     _do_path_update(path, update_path, config_key, name)
+    if sz > 0:
+        _log_time_size(t0, sz)
     return base_path


@@ -282,7 +294,8 @@ def load_data(subject, path=None, force_update=False, update_path=None,
     metadata = pd.DataFrame(metadata)
 
     # -- 6) Create custom epochs array
-    epochs = EpochsArray(data, info, events, tmin, event_id, metadata=metadata)
+    epochs = EpochsArray(data, info, events, tmin, event_id, metadata=metadata,
+                         verbose=False)
     epochs.info['bads'] = missing_chans  # missing channels are marked as bad.
 
     return epochs
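
Passing `verbose=False` to `EpochsArray` keeps per-subject epoch-construction messages from drowning out the new download report. A usage sketch (the subject number is illustrative):

from mne.datasets import limo

# First use: fetches LIMO.mat and Yr.mat for subject 2, logging
# 'Downloading LIMO data' once plus the total time and size.
epochs = limo.load_data(subject=2)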
37 changes: 22 additions & 15 deletions mne/datasets/sleep_physionet/_utils.py
@@ -10,7 +10,7 @@
 
 from ...utils import (verbose, _TempDir, _check_pandas_installed,
                       _on_missing)
-from ..utils import _get_path
+from ..utils import _get_path, _downloader_params
 
 AGE_SLEEP_RECORDS = op.join(op.dirname(__file__), 'age_records.csv')
 TEMAZEPAM_SLEEP_RECORDS = op.join(op.dirname(__file__),
@@ -30,18 +30,21 @@ def _fetch_one(fname, hashsum, path, force_update, base_url):
     # Fetch the file
     url = base_url + '/' + fname
     destination = op.join(path, fname)
-    if not op.isfile(destination) or force_update:
-        if op.isfile(destination):
-            os.remove(destination)
-        if not op.isdir(op.dirname(destination)):
-            os.makedirs(op.dirname(destination))
-        pooch.retrieve(
-            url=url,
-            known_hash=f"sha1:{hashsum}",
-            path=path,
-            fname=fname
-        )
-    return destination
+    if op.isfile(destination) and not force_update:
+        return destination, False
+    if op.isfile(destination):
+        os.remove(destination)
+    if not op.isdir(op.dirname(destination)):
+        os.makedirs(op.dirname(destination))
+    downloader = pooch.HTTPDownloader(**_downloader_params())
+    pooch.retrieve(
+        url=url,
+        known_hash=f"sha1:{hashsum}",
+        path=path,
+        downloader=downloader,
+        fname=fname
+    )
+    return destination, True


@verbose
@@ -87,11 +90,13 @@ def _update_sleep_temazepam_records(fname=TEMAZEPAM_SLEEP_RECORDS):
 
     # Download subjects info.
     subjects_fname = op.join(tmp, 'ST-subjects.xls')
+    downloader = pooch.HTTPDownloader(**_downloader_params())
     pooch.retrieve(
         url=TEMAZEPAM_RECORDS_URL,
         known_hash=f"sha1:{TEMAZEPAM_RECORDS_URL_SHA1}",
         path=tmp,
-        fname=op.basename(subjects_fname)
+        downloader=downloader,
+        fname=op.basename(subjects_fname),
     )

# Load and Massage the checksums.
@@ -146,11 +151,13 @@ def _update_sleep_age_records(fname=AGE_SLEEP_RECORDS):
 
     # Download subjects info.
     subjects_fname = op.join(tmp, 'SC-subjects.xls')
+    downloader = pooch.HTTPDownloader(**_downloader_params())
     pooch.retrieve(
         url=AGE_RECORDS_URL,
         known_hash=f"sha1:{AGE_RECORDS_URL_SHA1}",
         path=tmp,
-        fname=op.basename(subjects_fname)
+        downloader=downloader,
+        fname=op.basename(subjects_fname),
     )

# Load and Massage the checksums.
27 changes: 20 additions & 7 deletions mne/datasets/sleep_physionet/age.py
@@ -3,9 +3,13 @@
 #
 # License: BSD Style.
 
+import os
+import time
+
 import numpy as np
 
 from ...utils import verbose
+from ..utils import _log_time_size
 from ._utils import _fetch_one, _data_path, _on_missing, AGE_SLEEP_RECORDS
 from ._utils import _check_subjects
 
@@ -79,6 +83,7 @@ def fetch_data(subjects, recording=(1, 2), path=None, force_update=False,
     ----------
     .. footbibliography::
     """  # noqa: E501
+    t0 = time.time()
     records = np.loadtxt(AGE_SLEEP_RECORDS,
                          skiprows=1,
                          delimiter=',',
@@ -107,15 +112,23 @@
         _on_missing(on_missing, msg)
 
     fnames = []
+    sz = 0
     for subject in subjects:
         for idx in np.where(psg_records['subject'] == subject)[0]:
             if psg_records['record'][idx] in recording:
-                psg_fname = _fetch_one(psg_records['fname'][idx].decode(),
-                                       psg_records['sha'][idx].decode(),
-                                       *params)
-                hyp_fname = _fetch_one(hyp_records['fname'][idx].decode(),
-                                       hyp_records['sha'][idx].decode(),
-                                       *params)
+                psg_fname, pdl = _fetch_one(
+                    psg_records['fname'][idx].decode(),
+                    psg_records['sha'][idx].decode(),
+                    *params)
+                hyp_fname, hdl = _fetch_one(
+                    hyp_records['fname'][idx].decode(),
+                    hyp_records['sha'][idx].decode(),
+                    *params)
                 fnames.append([psg_fname, hyp_fname])
-
+                if pdl:
+                    sz += os.path.getsize(psg_fname)
+                if hdl:
+                    sz += os.path.getsize(hyp_fname)
+    if sz > 0:
+        _log_time_size(t0, sz)
     return fnames
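
Because `_fetch_one` now returns a `(path, downloaded)` tuple, only files that were actually transferred count toward `sz`, and a fully cached call logs no report at all. A usage sketch (subject and recording values are illustrative):

from mne.datasets.sleep_physionet import age

# Fetches the PSG/hypnogram pairs for one subject; if both files of a
# pair are already cached, pdl/hdl are False and sz stays 0.
fnames = age.fetch_data(subjects=[0], recording=[1])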