Skip to content

Commit

Permalink
Fix plantseg volumes - remove slices with artifacts (#344)
Browse files Browse the repository at this point in the history
* Fix plantseg - remove slices with artifacts

* Refactor slicing logic to ignore normal volumes

* Remove debugging scripts

* Fix removal of redundant volume in root train

* Minor fix to dataset type name
  • Loading branch information
anwai98 authored Sep 22, 2024
1 parent b691a67 commit 404ab17
Showing 1 changed file with 59 additions and 1 deletion.
60 changes: 59 additions & 1 deletion torch_em/data/datasets/light_microscopy/plantseg.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,18 @@

import os
from glob import glob
from tqdm import tqdm
from typing import List, Optional, Tuple, Union

import torch_em
import h5py

from torch.utils.data import Dataset, DataLoader

import torch_em

from .. import util


URLS = {
"root": {
"train": "https://files.de-1.osf.io/v1/resources/9x3g2/providers/osfstorage/?zip=",
Expand Down Expand Up @@ -48,6 +54,27 @@
# "test": "a7272f6ad1d765af6d121e20f436ac4f3609f1a90b1cb2346aa938d8c52800b9",
}
}

# Maps the file name of a volume to the slice that should be kept from it.
# `_fix_inconsistent_volumes` indexes the 'raw' and 'label' datasets with this slice,
# i.e. it is applied along the first axis. Files that are not listed here are not cropped.
# Each table row is (file name, start, stop) and becomes `slice(start, stop)`.
CROPPING_VOLUMES = {
    fname: slice(start, stop)
    for fname, start, stop in (
        # root (train)
        ("Movie2_T00006_crop_gt.h5", 4, None),
        ("Movie2_T00008_crop_gt.h5", None, -18),
        ("Movie2_T00010_crop_gt.h5", None, -32),
        ("Movie2_T00012_crop_gt.h5", None, -39),
        ("Movie2_T00014_crop_gt.h5", None, -40),
        ("Movie2_T00016_crop_gt.h5", None, -42),
        # root (test)
        ("Movie2_T00020_crop_gt.h5", None, -50),
        # ovules (train)
        ("N_487_ds2x.h5", 17, None),
        ("N_535_ds2x.h5", None, -1),
        ("N_534_ds2x.h5", None, -1),
        ("N_451_ds2x.h5", None, -1),
        ("N_425_ds2x.h5", None, -1),
        # ovules (val)
        ("N_420_ds2x.h5", None, -1),
    )
}

# The resolution previously used for the resizing.
# I have removed this feature since it was not reliable,
# but am leaving this here for reference.
Expand All @@ -56,6 +83,36 @@
# NATIVE_RESOLUTION = (0.235, 0.075, 0.075)


def _fix_inconsistent_volumes(data_path: Union[os.PathLike, str], name: str, split: str) -> None:
    """Fix known inconsistencies of the extracted PlantSeg volumes, in place on disk.

    Two fixes are applied to the `*.h5` files located directly in `data_path`:
    - `Movie1_t00045_crop_gt.h5` is deleted for `name == "root"` and `split == "train"`,
      to avoid a volume that is duplicated between 'train' and 'test'.
    - For every file listed in `CROPPING_VOLUMES`, the 'raw' and 'label' datasets are cropped
      to the slice stored for that file; the datasets are resized and overwritten in the file.

    The function modifies and deletes files. It is not idempotent: running it a second time
    on already cropped volumes would crop them again.

    Args:
        data_path: The folder that contains the extracted h5 volumes.
        name: The name of the data, e.g. 'root' or 'ovules'.
        split: The split of the data, e.g. 'train' or 'val'.
    """
    # NOTE(review): because of the `and`, this returns early only if BOTH `name` and `split`
    # are outside the listed values. E.g. ('root', 'test') is still processed, which the
    # 'root (test)' entry of CROPPING_VOLUMES relies on. Confirm before tightening this to `or`.
    if name not in ["root", "ovules"] and split not in ["train", "val"]:
        return

    # Collect the volumes only after the guard, so that nothing is scanned when we skip.
    volume_paths = glob(os.path.join(data_path, "*.h5"))

    for vol_path in tqdm(volume_paths, desc="Fixing inconsistencies in volumes"):
        fname = os.path.basename(vol_path)

        # avoid duplicated volumes in 'train' and 'test'.
        if fname == "Movie1_t00045_crop_gt.h5" and name == "root" and split == "train":
            os.remove(vol_path)
            continue

        if fname not in CROPPING_VOLUMES:
            continue

        crop_slices = CROPPING_VOLUMES[fname]

        with h5py.File(vol_path, "r+") as f:
            raw, labels = f["raw"], f["label"]

            # Slice the datasets directly, so that only the region we keep is read into memory
            # (instead of loading the full volume first and slicing the in-memory copy).
            # Both reads happen before any write, so a failing read leaves the file untouched.
            cropped_raw, cropped_labels = raw[crop_slices], labels[crop_slices]

            # Shrink each dataset to the shape of its own cropped data.
            # NOTE(review): `Dataset.resize` only works for resizable (chunked) datasets;
            # presumably the PlantSeg files are stored that way - confirm.
            raw.resize(cropped_raw.shape)
            labels.resize(cropped_labels.shape)

            # Overwrite the (now smaller) datasets with the cropped data.
            raw[...] = cropped_raw
            labels[...] = cropped_labels


def get_plantseg_data(path: Union[os.PathLike, str], download: bool, name: str, split: str) -> str:
"""Download the PlantSeg training data.
Expand All @@ -77,6 +134,7 @@ def get_plantseg_data(path: Union[os.PathLike, str], download: bool, name: str,
tmp_path = os.path.join(path, f"{name}_{split}.zip")
util.download_source(tmp_path, url, download, checksum)
util.unzip(tmp_path, out_path, remove=True)
_fix_inconsistent_volumes(out_path, name, split)
return out_path


Expand Down

0 comments on commit 404ab17

Please sign in to comment.