diff --git a/dandi/cli/cmd_organize.py b/dandi/cli/cmd_organize.py index 77a3cb819..e6be46bf4 100644 --- a/dandi/cli/cmd_organize.py +++ b/dandi/cli/cmd_organize.py @@ -34,6 +34,22 @@ default="auto", show_default=True, ) +@click.option( + "--update-external-file-paths", + is_flag=True, + default=False, + help="Rewrite the 'external_file' arguments of ImageSeries in NWB files. " + "The new values will correspond to the new locations of the video files " + "after being organized. " + "This option requires --files-mode to be 'copy' or 'move'", +) +@click.option( + "--media-files-mode", + type=click.Choice(["copy", "move", "symlink", "hardlink"]), + default=None, + help="This option works on the video files on disc while being organized " + "along side nwb files.", +) @click.argument("paths", nargs=-1, type=click.Path(exists=True)) @devel_debug_option() @map_to_click_exceptions @@ -43,6 +59,8 @@ def organize( invalid="fail", files_mode="auto", devel_debug=False, + update_external_file_paths=False, + media_files_mode=None, ): """(Re)organize files according to the metadata. 
@@ -80,11 +98,13 @@ def organize( from ..dandiset import Dandiset from ..metadata import get_metadata from ..organize import ( + _create_external_file_names, create_unique_filenames_from_metadata, detect_link_type, filter_invalid_metadata_rows, + organize_external_files, ) - from ..pynwb_utils import ignore_benign_pynwb_warnings + from ..pynwb_utils import ignore_benign_pynwb_warnings, rename_nwb_external_files from ..utils import Parallel, copy_file, delayed, find_files, load_jsonl, move_file in_place = False # If we deduce that we are organizing in-place @@ -104,6 +124,11 @@ def act(func, *args, **kwargs): lgr.debug("%s %s %s", func.__name__, args, kwargs) return func(*args, **kwargs) + if update_external_file_paths and files_mode not in ["copy", "move"]: + raise click.UsageError( + "--files-mode needs to be one of 'copy/move' for the rewrite option to work" + ) + if dandiset_path is None: dandiset = Dandiset.find(os.curdir) if not dandiset: @@ -140,7 +165,7 @@ def act(func, *args, **kwargs): "Only 'dry' or 'move' mode could be used to operate in-place " "within a dandiset (no paths were provided)" ) - lgr.info(f"We will organize {dandiset_path} in-place") + lgr.info("We will organize %s in-place", dandiset_path) in_place = True paths = dandiset_path @@ -214,6 +239,37 @@ def _get_metadata(path): metadata = create_unique_filenames_from_metadata(metadata) + # update metadata with external_file information: + external_files_missing_in_nwbfiles = [ + len(m["external_file_objects"]) == 0 for m in metadata + ] + + if all(external_files_missing_in_nwbfiles) and update_external_file_paths: + lgr.warning( + "--update-external-file-paths specified but no external_files found " + "linked to any nwbfile found in %s", + paths, + ) + elif not all(external_files_missing_in_nwbfiles) and not update_external_file_paths: + files_list = [ + metadata[no]["path"] + for no, a in enumerate(external_files_missing_in_nwbfiles) + if not a + ] + raise click.UsageError( + 
"--update-external-file-paths option not specified but found " + "external video files linked to the nwbfiles " + f"{', '.join(files_list)}" + ) + + if update_external_file_paths and media_files_mode is None: + media_files_mode = "symlink" + lgr.warning( + "--media-files-mode not specified, setting to recommended mode: 'symlink' " + ) + + metadata = _create_external_file_names(metadata) + # Verify first that the target paths do not exist yet, and fail if they do # Note: in "simulate" mode we do early check as well, so this would be # duplicate but shouldn't hurt @@ -313,10 +369,15 @@ def _get_metadata(path): if op.exists(d): try: os.rmdir(d) - lgr.info(f"Removed empty directory {d}") + lgr.info("Removed empty directory %s", d) except Exception as exc: lgr.debug("Failed to remove directory %s: %s", d, exc) + # create video file name and re write nwb file external files: + if update_external_file_paths: + rename_nwb_external_files(metadata, dandiset_path) + organize_external_files(metadata, dandiset_path, media_files_mode) + def msg_(msg, n, cond=None): if hasattr(n, "__len__"): n = len(n) diff --git a/dandi/consts.py b/dandi/consts.py index 9d2d87d47..535d6e986 100644 --- a/dandi/consts.py +++ b/dandi/consts.py @@ -148,6 +148,9 @@ class DandiInstance(NamedTuple): #: of retries) RETRY_STATUSES = (500, 502, 503, 504) +VIDEO_FILE_EXTENSIONS = [".mp4", ".avi", ".wmv", ".mov", ".flv"] +VIDEO_FILE_MODULES = ["processing", "acquisition"] + #: Maximum allowed depth of a Zarr directory tree MAX_ZARR_DEPTH = 5 diff --git a/dandi/organize.py b/dandi/organize.py index 6b8699fd1..95a8342a5 100644 --- a/dandi/organize.py +++ b/dandi/organize.py @@ -9,13 +9,15 @@ import os.path as op from pathlib import Path import re +from typing import List +import uuid import numpy as np from . 
def _create_external_file_names(metadata: List[dict]) -> List[dict]:
    """Return a copy of the metadata extended with new names for external files.

    For every record in ``meta["external_file_objects"]`` a list
    ``external_files_renamed`` is added, parallel to ``external_files``.
    Each new name follows the rule::

        <NWB file name without extension>/<id>_external_file_<index><suffix>

    where the NWB file name is the basename of ``meta["dandi_path"]``,
    ``<id>`` is the record's ``id`` and ``<suffix>`` is the extension of the
    original external file.  For example ``name1.mp4`` referenced by the
    series ``abc`` of ``sub-01/sub-01_ses-1.nwb`` becomes
    ``sub-01_ses-1/abc_external_file_0.mp4``.

    Entries lacking ``dandi_path`` or ``external_file_objects`` are left
    untouched.

    Parameters
    ----------
    metadata: list
        list of metadata dictionaries; external files may be given either as
        ``pathlib.Path`` objects or as strings

    Returns
    -------
    metadata: list
        deep copy of the input with ``external_files_renamed`` filled in; the
        input itself is not modified
    """
    metadata = deepcopy(metadata)
    for meta in metadata:
        if "dandi_path" not in meta or "external_file_objects" not in meta:
            continue
        nwb_folder_name = op.splitext(op.basename(meta["dandi_path"]))[0]
        for ext_file_dict in meta["external_file_objects"]:
            uuid_str = ext_file_dict.get("id")
            if uuid_str is None:
                # Only invent an identifier when the record does not carry one
                uuid_str = str(uuid.uuid4())
            ext_file_dict["external_files_renamed"] = [
                op.join(
                    nwb_folder_name,
                    # Path() so that plain strings are accepted as well
                    f"{uuid_str}_external_file_{no}{Path(ext_file).suffix}",
                )
                for no, ext_file in enumerate(ext_file_dict["external_files"])
            ]
    return metadata


def organize_external_files(
    metadata: List[dict], dandiset_path: str, files_mode: str
) -> None:
    """Place the external files at their new locations inside the Dandiset.

    Every file listed in ``external_files`` is linked, copied or moved to
    ``<dandiset_path>/<directory of dandi_path>/<external_files_renamed entry>``;
    missing target directories are created.

    Entries lacking ``dandi_path`` or ``external_file_objects`` are skipped,
    mirroring ``_create_external_file_names`` which does not produce new names
    for them.

    Parameters
    ----------
    metadata: list
        list of metadata dictionaries already carrying
        ``external_files_renamed`` for each external file record
    dandiset_path: str
        full path of the main dandiset folder.
    files_mode: str
        one of "symlink", "copy", "move", "hardlink"

    Raises
    ------
    NotImplementedError
        if there is a file to organize and ``files_mode`` is not supported;
        raised before anything is created for that file
    """
    for meta in metadata:
        if "dandi_path" not in meta or "external_file_objects" not in meta:
            continue
        target_dir = op.join(dandiset_path, op.dirname(meta["dandi_path"]))
        for ext_file_dict in meta["external_file_objects"]:
            for name_old, name_new in zip(
                ext_file_dict["external_files"],
                ext_file_dict["external_files_renamed"],
            ):
                if files_mode not in ("symlink", "hardlink", "copy", "move"):
                    raise NotImplementedError(files_mode)
                new_path = op.join(target_dir, name_new)
                name_old_str = str(name_old)
                os.makedirs(op.dirname(new_path), exist_ok=True)
                if files_mode == "symlink":
                    # A relative link target is resolved against the directory
                    # of the link, not the current directory, so make it
                    # absolute to point at the same file the other modes use.
                    os.symlink(op.abspath(name_old_str), new_path)
                elif files_mode == "hardlink":
                    os.link(name_old_str, new_path)
                elif files_mode == "copy":
                    copy_file(name_old_str, new_path)
                else:
                    move_file(name_old_str, new_path)
def _get_image_series(nwb: pynwb.NWBFile) -> List[dict]:
    """Retrieve ImageSeries related metadata from an open NWB file.

    For every ImageSeries that has ``external_file`` set, the object id, the
    name and the externally linked video files are collected.

    Parameters
    ----------
    nwb: pynwb.NWBFile

    Returns
    -------
    out: List[dict]
        list of dicts: [{id: <object id>, name: <series name>,
        external_files: [<pathlib.Path>, ...]}].
        External files whose extension is not listed in VIDEO_FILE_EXTENSIONS
        are reported with a warning and left out of ``external_files``.
        If no ImageSeries is found in the modules named by VIDEO_FILE_MODULES,
        an empty list is returned.
    """
    out = []
    for module_name in VIDEO_FILE_MODULES:
        module_cont = getattr(nwb, module_name)
        # NOTE(review): the entries of ``nwb.processing`` are presumably
        # ProcessingModule containers rather than the series themselves, in
        # which case ImageSeries nested inside them never pass the isinstance
        # check below -- confirm whether they should be searched as well.
        for ob in module_cont.values():
            if not isinstance(ob, pynwb.image.ImageSeries) or ob.external_file is None:
                continue
            out_dict = dict(id=ob.object_id, name=ob.name, external_files=[])
            for ext_file in ob.external_file:
                # The known extensions are all lowercase; compare
                # case-insensitively so that e.g. ".AVI" is recognised too.
                if Path(ext_file).suffix.lower() in VIDEO_FILE_EXTENSIONS:
                    out_dict["external_files"].append(Path(ext_file))
                else:
                    lgr.warning(
                        "external file %s should be one of: %s",
                        ext_file,
                        ", ".join(VIDEO_FILE_EXTENSIONS),
                    )
            out.append(out_dict)
    return out


def _rewrite_external_file_entries(container, ext_file_dict: dict) -> None:
    """Replace the collected entries of ``container.external_file`` by their new names.

    ``ext_file_dict["external_files"]`` is an in-order subset of the stored
    entries (files with unrecognised extensions are not collected), so the
    stored list is walked once and only the entries matching the next
    collected file are rewritten.  Writing by the index of the collected list
    would overwrite the wrong slot as soon as one entry had been left out.
    """
    pairs = list(
        zip(ext_file_dict["external_files"], ext_file_dict["external_files_renamed"])
    )
    pending = 0
    # Snapshot the stored values so that the list is not read while modified
    for position, current in enumerate(list(container.external_file)):
        if pending == len(pairs):
            break
        if isinstance(current, bytes):
            current = current.decode()
        name_old, name_new = pairs[pending]
        if Path(current) != Path(name_old):
            continue
        # NOTE(review): no io.write() follows; external_file is presumably
        # loaded lazily as an HDF5 dataset, so with the file opened "r+" the
        # item assignment is written straight to disk -- confirm.
        container.external_file[position] = str(name_new)
        pending += 1
    if pending < len(pairs):
        lgr.warning(
            "%d external file(s) of %s were not found in the nwb file and not renamed",
            len(pairs) - pending,
            ext_file_dict["id"],
        )


def rename_nwb_external_files(metadata: List[dict], dandiset_path: str) -> None:
    """Rewrite the external_file entries of ImageSeries in organized NWB files.

    The ImageSeries to update and the new names are taken from
    ``meta["external_file_objects"]`` (``id``, ``external_files`` and
    ``external_files_renamed``).  Each NWB file is opened in place at
    ``<dandiset_path>/<meta["dandi_path"]>``.

    Metadata entries lacking one of "path", "dandi_path" or
    "external_file_objects" are reported with a warning and skipped; the
    remaining entries are still processed.  Files without any external file
    record are not opened at all.

    Parameters
    ----------
    metadata: List[dict]
        list of dictionaries containing the metadata gathered from the nwbfile
    dandiset_path: str
        base path of dandiset
    """
    required_keys = ("path", "dandi_path", "external_file_objects")
    for meta in metadata:
        if not all(key in meta for key in required_keys):
            lgr.warning(
                "could not rename external files, update metadata "
                'with "path", "dandi_path", "external_file_objects"'
            )
            # One incomplete entry must not prevent renaming in other files
            continue
        if not meta["external_file_objects"]:
            continue
        dandiset_nwbfile_path = op.join(dandiset_path, meta["dandi_path"])
        with NWBHDF5IO(dandiset_nwbfile_path, mode="r+", load_namespaces=True) as io:
            nwb = io.read()
            for ext_file_dict in meta["external_file_objects"]:
                # retrieve nwb neurodata object of the given object id:
                container = nwb.objects.get(ext_file_dict["id"])
                if container is None:
                    continue
                _rewrite_external_file_entries(container, ext_file_dict)
def video_nwbfiles(tmp_path):
    """Create NWB files that reference small video files as external files.

    Two pairs of ``.avi`` files filled with random frames are written to
    ``tmp_path / "video_files"``.  For each pair one NWB file is written to
    ``tmp_path / "nwbfiles"``; it holds a single ImageSeries acquisition whose
    ``external_file`` lists both videos of the pair.

    Returns
    -------
    pathlib.Path
        the directory containing the generated NWB files
    """
    import cv2

    n_frames, height, width = 5, 10, 20
    video_dir = tmp_path / "video_files"
    video_dir.mkdir()
    video_paths = []
    for no in range(2):
        movie_files = (video_dir / f"test1_{no}.avi", video_dir / f"test2_{no}.avi")
        writers = [
            cv2.VideoWriter(
                filename=str(movie_file),
                apiPreference=None,
                fourcc=cv2.VideoWriter_fourcc(*"DIVX"),
                fps=25,
                frameSize=(width, height),
                params=None,
            )
            for movie_file in movie_files
        ]
        try:
            for _ in range(n_frames):
                for writer in writers:
                    writer.write(
                        np.random.randint(0, 255, (height, width, 3)).astype("uint8")
                    )
        finally:
            # Release the writers even if writing a frame fails
            for writer in writers:
                writer.release()
        video_paths.append(movie_files)

    base_nwb_path = tmp_path / "nwbfiles"
    base_nwb_path.mkdir(parents=True, exist_ok=True)

    for no, (vid_1, vid_2) in enumerate(video_paths):
        subject_id = f"mouse{no}"
        session_id = f"sessionid{no}"
        subject = Subject(
            subject_id=subject_id,
            species="Mus musculus",
            sex="M",
            description="lab mouse ",
        )
        device = Device(f"imaging_device_{no}")
        name = f"{vid_1.stem}_{no}"
        nwbfile = NWBFile(
            f"{name}{no}",
            "desc: contains movie for dandi .mp4 storage as external",
            datetime.now(tzlocal()),
            experimenter="Experimenter name",
            session_id=session_id,
            subject=subject,
            devices=[device],
        )

        image_series = ImageSeries(
            name=f"MouseWhiskers{no}",
            format="external",
            external_file=[str(vid_1), str(vid_2)],
            # One starting frame per external file: the second video starts
            # right after the n_frames frames of the first one.
            starting_frame=[0, n_frames],
            starting_time=0.0,
            rate=150.0,
        )
        nwbfile.add_acquisition(image_series)

        nwbfile_path = base_nwb_path / f"{name}.nwb"
        with NWBHDF5IO(str(nwbfile_path), "w") as io:
            io.write(nwbfile)
    return base_nwb_path
@pytest.mark.parametrize("mode", ["copy", "move"])
@pytest.mark.parametrize("video_mode", ["copy", "move", "symlink", "hardlink"])
def test_video_organize(video_mode, mode, video_nwbfiles):
    """Organize NWB files with external videos and check names and locations.

    Runs ``organize`` with ``--update-external-file-paths`` for every
    combination of NWB files mode and media files mode, then verifies that
    each organized NWB file references its videos under the expected new names
    and that those videos exist next to it.
    """
    organized_root = video_nwbfiles.parent / "dandi_organized"
    # List the source videos before organizing: "move" relocates them
    source_videos = list((video_nwbfiles.parent / "video_files").iterdir())
    args = ["--files-mode", mode, "--update-external-file-paths"]
    args += ["--media-files-mode", video_mode]
    args += ["-d", str(organized_root), str(video_nwbfiles)]

    result = CliRunner().invoke(organize, args)
    assert result.exit_code == 0

    organized_videos = []
    for nwb_path in organized_root.glob("**/*.nwb"):
        # Videos are expected in a folder named like the NWB file, sans suffix
        video_dir = nwb_path.with_suffix("")
        assert video_dir.exists()
        with NWBHDF5IO(str(nwb_path), "r", load_namespaces=True) as io:
            # image series records as dict(id=object_id, external_files=[])
            series_records = _get_image_series(io.read())
            for record in series_records:
                for index, ext_path in enumerate(record["external_files"]):
                    organized_videos.append(ext_path)
                    # the external_file entry must follow the naming convention
                    expected_stem = Path(
                        f"{video_dir.name}/{record['id']}_external_file_{index}"
                    )
                    assert str(expected_stem) == str(Path(ext_path).with_suffix(""))
                    # the file must exist for every media files mode
                    assert (video_dir.parent / ext_path).exists()
    # every source video must have been organized
    assert len(source_videos) == len(organized_videos)