Merge pull request #7 from rmnldwg/release-0.1.0
Release 0.1.0
rmnldwg authored Oct 28, 2024
2 parents 65bb8b5 + 6bf14ae commit 3bbb3c4
Showing 6 changed files with 103 additions and 26 deletions.
22 changes: 22 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,26 @@

All notable changes to this project will be documented in this file.

## [0.1.0] - 2024-10-28

### 🚀 Features

- *(utils)* Add the often-needed `enhance` function to complete sub-/superlevel involvement and infer the maximum likelihood status.

### 🐛 Bug Fixes

- Avoid `KeyError` in `infer_superlevels`

### ⚙️ Miscellaneous Tasks

- Add link to release 0.0.4

### Change

- `infer_su(b|per)levels` skips inferring involvement of sub-/super LNLs that are already present
- *(load)* Rename `skip_disk` to `use_github`
- *(query)* Rename `in_` to `isin` for `C` object

## [0.0.4] - 2024-10-11

### 🚀 Features
@@ -119,6 +139,8 @@ Initial implementation of the lyDATA library.
<!-- generated by git-cliff -->
<!-- markdownlint-disable-file MD024 -->

[0.1.0]: https://github.com/rmnldwg/lydata/compare/0.0.4..0.1.0
[0.0.4]: https://github.com/rmnldwg/lydata/compare/0.0.3..0.0.4
[0.0.3]: https://github.com/rmnldwg/lydata/compare/0.0.2..0.0.3
[0.0.2]: https://github.com/rmnldwg/lydata/compare/0.0.1..0.0.2
[0.0.1]: https://github.com/rmnldwg/lydata/compare/63b2d867888aa8f583c498ff3fc3f94cdb48765c..0.0.1
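
Taken together, the 0.1.0 additions and renames look as follows in downstream code. This is a minimal editorial sketch, not part of the diff, assuming a standard lydata install with the bundled datasets:

```python
import lydata

# New in 0.1.0: `enhance` completes sub-/superlevel involvement and adds a
# maximum likelihood estimate under the top-level `max_llh` key.
df = lydata.join_datasets(year="2023")
enhanced = lydata.enhance(df)

# Renamed in 0.1.0: pass `use_github=True` (formerly `skip_disk=True`) to
# fetch the datasets from GitHub instead of the local disk.
remote = lydata.join_datasets(year="2023", use_github=True)
```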
2 changes: 2 additions & 0 deletions lydata/__init__.py
@@ -9,6 +9,7 @@
join_datasets,
load_datasets,
)
from lydata.utils import enhance
from lydata.validator import validate_datasets

__author__ = "Roman Ludwig"
@@ -24,6 +25,7 @@
"join_datasets",
"load_datasets",
"validate_datasets",
"enhance",
]

logger = logging.getLogger(__name__)
22 changes: 16 additions & 6 deletions lydata/accessor.py
@@ -46,6 +46,8 @@
)
from lydata.validator import construct_schema

warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)


def _get_all_true(df: pd.DataFrame) -> pd.Series:
"""Return a mask with all entries set to ``True``."""
@@ -293,10 +295,10 @@ def __ne__(self, value: Any) -> Q:
"""
return Q(self.column, "!=", value)

-def in_(self, value: list[Any]) -> Q:
+def isin(self, value: list[Any]) -> Q:
"""Create a query object for checking if the column values are in a list.
->>> C('foo').in_([1, 2, 3])
+>>> C('foo').isin([1, 2, 3])
Q('foo', 'in', [1, 2, 3])
"""
return Q(self.column, "in", value)
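
For downstream code the rename is mechanical, and the new name matches pandas' own `Series.isin`. A small sketch based on the doctest above (not part of the diff):

```python
from lydata.accessor import C

# Before 0.1.0: C("foo").in_([1, 2, 3])
# Since 0.1.0:  C("foo").isin([1, 2, 3])
query = C("foo").isin([1, 2, 3])
print(repr(query))  # Q('foo', 'in', [1, 2, 3])
```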
@@ -756,8 +758,9 @@ def infer_sublevels(
"V": ["a", "b"],
}
-The resulting DataFrame will only contain the newly inferred sublevel columns.
-Thus, one can simply :py:meth:`~pandas.DataFrame.update` the original DataFrame
+The resulting DataFrame will only contain the newly inferred sublevel columns
+and only for those sublevels that were not already present in the DataFrame.
+Thus, one can simply :py:meth:`~pandas.DataFrame.join` the original DataFrame
 with the result.
>>> df = pd.DataFrame({
@@ -795,6 +798,9 @@ def infer_sublevels(

for subid in subids:
sublevel = superlevel + subid
if sublevel in self._obj[modality, side]:
continue

result.loc[is_healthy, (modality, side, sublevel)] = False
result.loc[~is_healthy, (modality, side, sublevel)] = None
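
With the new guard above, the returned frame contains only genuinely new sublevel columns, so joining it back cannot clobber existing data. A sketch of the pattern; the column tuples and the `modalities` keyword are assumptions based on the surrounding docstrings, not shown in this diff:

```python
import pandas as pd
import lydata  # assumed to register the `ly` accessor on import

df = pd.DataFrame({
    ("MRI", "ipsi", "I"): [True, False],
    ("MRI", "ipsi", "Ia"): [True, None],  # already present -> now skipped
})
new = df.ly.infer_sublevels(modalities=["MRI"])  # `modalities` kwarg assumed
df = df.join(new)  # adds e.g. ("MRI", "ipsi", "Ib"); "Ia" stays untouched
```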

@@ -815,8 +821,9 @@ def infer_superlevels(
The superlevel's status is computed for the specified ``modalities``. Which
sublevels (if any) a superlevel has is specified in ``subdivisions``.
-The resulting DataFrame will only contain the newly inferred superlevel columns.
-This way, it is straightforward to :py:meth:`~pandas.DataFrame.update` the
+The resulting DataFrame will only contain the newly inferred superlevel columns
+and only for those superlevels that were not already present in the DataFrame.
+This way, it is straightforward to :py:meth:`~pandas.DataFrame.join` it with the
 original DataFrame.
>>> df = pd.DataFrame({
@@ -857,6 +864,9 @@
except KeyError:
continue

if superlevel in self._obj[modality, side]:
continue

result.loc[are_all_healthy, (modality, side, superlevel)] = False
result.loc[is_any_involved, (modality, side, superlevel)] = True
result.loc[is_unknown, (modality, side, superlevel)] = None
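
The three assignments above encode a three-valued logic: a superlevel is healthy only if all of its sublevels are, involved if any sublevel is, and unknown otherwise. A toy illustration in plain pandas, independent of the accessor:

```python
import pandas as pd

# Sublevel involvement of LNL II for three patients (None = unknown).
sub = pd.DataFrame({"IIa": [False, True, None], "IIb": [False, None, None]})

all_healthy = sub.eq(False).all(axis=1)   # every sublevel reported healthy
any_involved = sub.eq(True).any(axis=1)   # at least one sublevel involved

superlevel = pd.Series([None] * len(sub), dtype=object)
superlevel[all_healthy] = False   # healthy
superlevel[any_involved] = True   # involved
print(superlevel.tolist())        # [False, True, None]
```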
36 changes: 18 additions & 18 deletions lydata/loader.py
@@ -128,12 +128,12 @@ def get_description(self) -> str:

def load(
self,
-skip_disk: bool = False,
+use_github: bool = False,
**load_kwargs,
) -> pd.DataFrame:
"""Load the ``data.csv`` file from disk or from GitHub.
-One can also choose to ``skip_disk``. Any keyword arguments are passed to
+One can also choose to ``use_github``. Any keyword arguments are passed to
:py:func:`pandas.read_csv`.
The method will store the output of :py:meth:`~pydantic.BaseModel.model_dump`
@@ -144,15 +144,15 @@ def load(
>>> df_from_disk = conf.load()
>>> df_from_disk.shape
(263, 82)
->>> df_from_github = conf.load(skip_disk=True)
+>>> df_from_github = conf.load(use_github=True)
>>> np.all(df_from_disk.fillna(0) == df_from_github.fillna(0))
np.True_
"""
kwargs = {"header": [0, 1, 2]}
kwargs.update(load_kwargs)

try:
-if skip_disk:
+if use_github:
logger.info(f"Skipping loading from {self.path}.")
raise SkipDiskError
df = pd.read_csv(self.path, **kwargs)
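
A hedged usage sketch of the renamed flag; the constructor fields are inferred from the dataset naming scheme and are not shown in this diff:

```python
from lydata.loader import LyDatasetConfig

# Field names assumed from the "2021-usz-oropharynx" naming scheme.
conf = LyDatasetConfig(year=2021, institution="usz", subsite="oropharynx")
df_local = conf.load()                  # reads data.csv from disk
df_remote = conf.load(use_github=True)  # was: conf.load(skip_disk=True)
```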
@@ -273,7 +273,7 @@ def available_datasets(
institution: str = "*",
subsite: str = "*",
search_paths: list[Path] | None = None,
-skip_disk: bool = False,
+use_github: bool = False,
repo: str = _repo,
ref: str = "main",
) -> Generator[LyDatasetConfig, None, None]:
@@ -288,17 +288,17 @@
the parent directory of the directory containing this file. If the library is
installed, this will be the ``site-packages`` directory.
-With ``skip_disk`` set to ``True``, the function will not look for datasets on disk,
-but will instead look for them on GitHub. The ``repo`` and ``ref`` arguments can be
-used to specify the repository and the branch/tag/commit to look in.
+With ``use_github`` set to ``True``, the function will not look for datasets on
+disk, but will instead look for them on GitHub. The ``repo`` and ``ref`` arguments
+can be used to specify the repository and the branch/tag/commit to look in.
>>> avail_gen = available_datasets()
>>> sorted([ds.name for ds in avail_gen]) # doctest: +NORMALIZE_WHITESPACE
['2021-clb-oropharynx',
'2021-usz-oropharynx',
'2023-clb-multisite',
'2023-isb-multisite']
->>> avail_gen = available_datasets(skip_disk=True)
+>>> avail_gen = available_datasets(use_github=True)
>>> sorted([ds.name for ds in avail_gen]) # doctest: +NORMALIZE_WHITESPACE
['2021-clb-oropharynx',
'2021-usz-oropharynx',
@@ -307,15 +307,15 @@
>>> avail_gen = available_datasets(
... institution="hvh",
... ref="6ac98d",
-...     skip_disk=True,
+...     use_github=True,
... )
>>> sorted([ds.get_url("") for ds in avail_gen]) # doctest: +NORMALIZE_WHITESPACE
['https://raw.githubusercontent.com/rmnldwg/lydata/6ac98d/2024-hvh-oropharynx/']
"""
-if not skip_disk:
+if not use_github:
if repo != _repo or ref != "main":
warnings.warn(
"Parameters `repo` and `ref` are ignored, unless `skip_disk` "
"Parameters `repo` and `ref` are ignored, unless `use_github` "
"is set to `True`."
)
yield from _available_datasets_on_disk(
@@ -339,7 +339,7 @@ def load_datasets(
institution: str = "*",
subsite: str = "*",
search_paths: list[Path] | None = None,
-skip_disk: bool = False,
+use_github: bool = False,
repo: str = _repo,
ref: str = "main",
**kwargs,
@@ -355,20 +355,20 @@
institution=institution,
subsite=subsite,
search_paths=search_paths,
-skip_disk=skip_disk,
+use_github=use_github,
repo=repo,
ref=ref,
)
for dset_conf in dset_confs:
-yield dset_conf.load(skip_disk=skip_disk, **kwargs)
+yield dset_conf.load(use_github=use_github, **kwargs)


def join_datasets(
year: int | str = "*",
institution: str = "*",
subsite: str = "*",
search_paths: list[Path] | None = None,
-skip_disk: bool = False,
+use_github: bool = False,
repo: str = _repo,
ref: str = "main",
**kwargs,
@@ -381,15 +381,15 @@
>>> join_datasets(year="2023").shape
(705, 219)
>>> join_datasets(year="2023", skip_disk=True).shape
>>> join_datasets(year="2023", use_github=True).shape
(705, 219)
"""
gen = load_datasets(
year=year,
institution=institution,
subsite=subsite,
search_paths=search_paths,
-skip_disk=skip_disk,
+use_github=use_github,
repo=repo,
ref=ref,
**kwargs,
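
The rename propagates through the loader's module-level functions as well. A sketch with illustrative filter values:

```python
from lydata.loader import available_datasets, load_datasets

# Enumerate dataset configs on GitHub at a given ref instead of on disk.
for conf in available_datasets(use_github=True, ref="main"):
    print(conf.name)

# Stream the matching DataFrames straight from GitHub.
dfs = list(load_datasets(year="2023", use_github=True))
```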
43 changes: 43 additions & 0 deletions lydata/utils.py
@@ -121,6 +121,49 @@ def get_default_modalities() -> dict[str, ModalityConfig]:
}


def enhance(
dataset: pd.DataFrame,
infer_sublevels_kwargs: dict[str, Any] | None = None,
infer_superlevels_kwargs: dict[str, Any] | None = None,
combine_kwargs: dict[str, Any] | None = None,
) -> pd.DataFrame:
"""Enhance the dataset by inferring additional columns from the data.
This performs the following steps in order:
1. Infer the superlevel involvement for each diagnostic modality using the
:py:meth:`~lydata.accessor.LyDataAccessor.infer_superlevels` method.
2. Infer the sublevel involvement for each diagnostic modality using the
:py:meth:`~lydata.accessor.LyDataAccessor.infer_sublevels` method. This skips
all LNLs that were computed in the previous step.
3. Compute the maximum likelihood estimate of the true state of the patient using
the :py:meth:`~lydata.accessor.LyDataAccessor.combine` method.
.. important::
Performing these operations in any other order may lead to the loss of some
information or even to conflicting LNL involvement information.
The result contains all LNLs of interest in the head and neck region, as well as
the best estimate of the true state of the patient under the top-level key
``max_llh``.
"""
infer_sublevels_kwargs = infer_sublevels_kwargs or {}
infer_superlevels_kwargs = infer_superlevels_kwargs or {}
combine_kwargs = combine_kwargs or {}

result = dataset.copy()

result = result.join(result.ly.infer_sublevels(**infer_sublevels_kwargs))
result = result.join(result.ly.infer_superlevels(**infer_superlevels_kwargs))

max_llh = pd.concat(
{"max_llh": result.ly.combine(**combine_kwargs)},
axis="columns",
)
return result.join(max_llh)


def _main() -> None:
"""Run the main function."""
...
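
A short usage sketch of the new `enhance` function; the dataset filter values are illustrative:

```python
import lydata

# Illustrative filters; `load_datasets` yields one DataFrame per dataset.
df = next(lydata.load_datasets(year=2021, institution="usz"))
enhanced = lydata.enhance(df)

# Sub-/superlevels are completed, and the maximum likelihood estimate of
# the true state now sits under the top-level `max_llh` key.
print(enhanced["max_llh"].head())
```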
4 changes: 2 additions & 2 deletions lydata/validator.py
@@ -118,7 +118,7 @@ def validate_datasets(
year: int | str = "*",
institution: str = "*",
subsite: str = "*",
-skip_disk: bool = False,
+use_github: bool = False,
repo: str = "rmnldwg/lydata",
ref: str = "main",
**kwargs,
@@ -140,7 +140,7 @@
year=year,
institution=institution,
subsite=subsite,
-skip_disk=skip_disk,
+use_github=use_github,
repo=repo,
ref=ref,
):
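
The validator follows the same rename. A sketch:

```python
from lydata.validator import validate_datasets

# Validate datasets fetched from GitHub instead of local copies.
validate_datasets(use_github=True, ref="main")
```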
